Compare commits

...

2 Commits

Author SHA1 Message Date
Jay D Dee
e4265a6f11 v3.8.1.1 2018-02-09 23:30:14 -05:00
Jay D Dee
a28daca3ce v3.8.1 2018-02-07 16:38:45 -05:00
85 changed files with 5426 additions and 3985 deletions

View File

@@ -100,7 +100,8 @@ cpuminer_SOURCES = \
algo/lbry.c \ algo/lbry.c \
algo/luffa/sph_luffa.c \ algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \ algo/luffa/luffa.c \
algo/luffa/sse2/luffa_for_sse2.c \ algo/luffa/luffa_for_sse2.c \
algo/luffa/luffa-hash-2way.c \
algo/lyra2/lyra2.c \ algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \ algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2-gate.c \ algo/lyra2/lyra2rev2-gate.c \
@@ -127,7 +128,11 @@ cpuminer_SOURCES = \
algo/quark/anime-gate.c \ algo/quark/anime-gate.c \
algo/quark/anime.c \ algo/quark/anime.c \
algo/quark/anime-4way.c \ algo/quark/anime-4way.c \
algo/qubit/qubit-gate.c \
algo/qubit/qubit.c \ algo/qubit/qubit.c \
algo/qubit/qubit-2way.c \
algo/qubit/deep-gate.c \
algo/qubit/deep-2way.c \
algo/qubit/deep.c \ algo/qubit/deep.c \
algo/ripemd/sph_ripemd.c \ algo/ripemd/sph_ripemd.c \
algo/scrypt.c \ algo/scrypt.c \
@@ -143,8 +148,9 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \ algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \ algo/shavite/shavite.c \
algo/simd/sph_simd.c \ algo/simd/sph_simd.c \
algo/simd/sse2/nist.c \ algo/simd/nist.c \
algo/simd/sse2/vector.c \ algo/simd/vector.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \ algo/skein/sph_skein.c \
algo/skein/skein-hash-4way.c \ algo/skein/skein-hash-4way.c \
algo/skein/skein.c \ algo/skein/skein.c \

View File

@@ -16,6 +16,7 @@ See file RELEASE_NOTES for change log and compile instructions.
Supported Algorithms Supported Algorithms
-------------------- --------------------
anime Animecoin
argon2 argon2
axiom Shabal-256 MemoHash axiom Shabal-256 MemoHash
bastion bastion
@@ -78,6 +79,7 @@ Supported Algorithms
x13sm3 hsr (Hshare) x13sm3 hsr (Hshare)
x14 X14 x14 X14
x15 X15 x15 X15
x16r Ravencoin
x17 x17
xevan Bitsend xevan Bitsend
yescrypt Globalboost-Y (BSTY) yescrypt Globalboost-Y (BSTY)
@@ -136,10 +138,13 @@ output from the miner showing the startup and any errors.
Donations Donations
--------- ---------
I do not do this for money but I have a donation address if users cpuminer-opt has no fees of any kind but donations are accepted.
are so inclined.
bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
Happy mining! Happy mining!

View File

@@ -25,3 +25,12 @@ cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell... cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
If you like this software feel free to donate:
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

View File

@@ -98,8 +98,8 @@ Start mining.
Windows Windows
The following in how the Windows binary releases are built. It's old and Precompiled Windows binaries are built on a Linux host using Mingw
not very good but it works, for me anyway. with a more recent compiler than the following Windows hosted procedure.
Building on Windows prerequisites: Building on Windows prerequisites:
@@ -131,7 +131,7 @@ or similar Windows program.
In msys shell cd to miner directory. In msys shell cd to miner directory.
cd /c/path/to/cpuminer-opt cd /c/path/to/cpuminer-opt
Run winbuild.sh to build on Windows or execute the following commands. Run build.sh to build on Windows or execute the following commands.
./autogen.sh ./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
@@ -159,6 +159,20 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
Change Log Change Log
---------- ----------
v3.8.1.1
Fixed Windows AVX2 crash.
v3.8.1
Fixes x16r on CPUs with only SSE2.
More Optimizations for X algos, qubit & deep.
Corrected algo optimizations for scrypt and yescrypt, no new optimizations.
v3.8.0.1
Fixed x16r AVX2 low hash rate.
v3.8.0 v3.8.0
4way no longer a seperate feature, included in AVX2. 4way no longer a seperate feature, included in AVX2.

View File

@@ -553,22 +553,22 @@ do { \
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \ , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \ VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \ _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \ M[0x0] = mm_bswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \ M[0x1] = mm_bswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \ M[0x2] = mm_bswap_32( *(buf + 2) ); \
M[0x3] = mm_byteswap_32( *(buf + 3) ); \ M[0x3] = mm_bswap_32( *(buf + 3) ); \
M[0x4] = mm_byteswap_32( *(buf + 4) ); \ M[0x4] = mm_bswap_32( *(buf + 4) ); \
M[0x5] = mm_byteswap_32( *(buf + 5) ); \ M[0x5] = mm_bswap_32( *(buf + 5) ); \
M[0x6] = mm_byteswap_32( *(buf + 6) ); \ M[0x6] = mm_bswap_32( *(buf + 6) ); \
M[0x7] = mm_byteswap_32( *(buf + 7) ); \ M[0x7] = mm_bswap_32( *(buf + 7) ); \
M[0x8] = mm_byteswap_32( *(buf + 8) ); \ M[0x8] = mm_bswap_32( *(buf + 8) ); \
M[0x9] = mm_byteswap_32( *(buf + 9) ); \ M[0x9] = mm_bswap_32( *(buf + 9) ); \
M[0xA] = mm_byteswap_32( *(buf + 10) ); \ M[0xA] = mm_bswap_32( *(buf + 10) ); \
M[0xB] = mm_byteswap_32( *(buf + 11) ); \ M[0xB] = mm_bswap_32( *(buf + 11) ); \
M[0xC] = mm_byteswap_32( *(buf + 12) ); \ M[0xC] = mm_bswap_32( *(buf + 12) ); \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \ M[0xD] = mm_bswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \ M[0xE] = mm_bswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \ M[0xF] = mm_bswap_32( *(buf + 15) ); \
for (r = 0; r < rounds; r ++) \ for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \ ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \ H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -615,22 +615,22 @@ do { \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \ VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \ VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \ VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \ M0 = mm_bswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \ M1 = mm_bswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \ M2 = mm_bswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \ M3 = mm_bswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \ M4 = mm_bswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \ M5 = mm_bswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \ M6 = mm_bswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \ M7 = mm_bswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \ M8 = mm_bswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \ M9 = mm_bswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \ MA = mm_bswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \ MB = mm_bswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \ MC = mm_bswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \ MD = mm_bswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \ ME = mm_bswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \ MF = mm_bswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \ ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \ ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \ ROUND_S_4WAY(2); \
@@ -727,22 +727,22 @@ do { \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \ VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \ VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \ VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
M0 = mm256_byteswap_32( * buf ); \ M0 = mm256_bswap_32( * buf ); \
M1 = mm256_byteswap_32( *(buf+1) ); \ M1 = mm256_bswap_32( *(buf+1) ); \
M2 = mm256_byteswap_32( *(buf+2) ); \ M2 = mm256_bswap_32( *(buf+2) ); \
M3 = mm256_byteswap_32( *(buf+3) ); \ M3 = mm256_bswap_32( *(buf+3) ); \
M4 = mm256_byteswap_32( *(buf+4) ); \ M4 = mm256_bswap_32( *(buf+4) ); \
M5 = mm256_byteswap_32( *(buf+5) ); \ M5 = mm256_bswap_32( *(buf+5) ); \
M6 = mm256_byteswap_32( *(buf+6) ); \ M6 = mm256_bswap_32( *(buf+6) ); \
M7 = mm256_byteswap_32( *(buf+7) ); \ M7 = mm256_bswap_32( *(buf+7) ); \
M8 = mm256_byteswap_32( *(buf+8) ); \ M8 = mm256_bswap_32( *(buf+8) ); \
M9 = mm256_byteswap_32( *(buf+9) ); \ M9 = mm256_bswap_32( *(buf+9) ); \
MA = mm256_byteswap_32( *(buf+10) ); \ MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_byteswap_32( *(buf+11) ); \ MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_byteswap_32( *(buf+12) ); \ MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_byteswap_32( *(buf+13) ); \ MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_byteswap_32( *(buf+14) ); \ ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_byteswap_32( *(buf+15) ); \ MF = mm256_bswap_32( *(buf+15) ); \
ROUND_S_8WAY(0); \ ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \ ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \ ROUND_S_8WAY(2); \
@@ -848,22 +848,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \ M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \ M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \ M[0x2] = mm256_bswap_64( *(buf+2) ); \
M[0x3] = mm256_byteswap_64( *(buf+3) ); \ M[0x3] = mm256_bswap_64( *(buf+3) ); \
M[0x4] = mm256_byteswap_64( *(buf+4) ); \ M[0x4] = mm256_bswap_64( *(buf+4) ); \
M[0x5] = mm256_byteswap_64( *(buf+5) ); \ M[0x5] = mm256_bswap_64( *(buf+5) ); \
M[0x6] = mm256_byteswap_64( *(buf+6) ); \ M[0x6] = mm256_bswap_64( *(buf+6) ); \
M[0x7] = mm256_byteswap_64( *(buf+7) ); \ M[0x7] = mm256_bswap_64( *(buf+7) ); \
M[0x8] = mm256_byteswap_64( *(buf+8) ); \ M[0x8] = mm256_bswap_64( *(buf+8) ); \
M[0x9] = mm256_byteswap_64( *(buf+9) ); \ M[0x9] = mm256_bswap_64( *(buf+9) ); \
M[0xA] = mm256_byteswap_64( *(buf+10) ); \ M[0xA] = mm256_bswap_64( *(buf+10) ); \
M[0xB] = mm256_byteswap_64( *(buf+11) ); \ M[0xB] = mm256_bswap_64( *(buf+11) ); \
M[0xC] = mm256_byteswap_64( *(buf+12) ); \ M[0xC] = mm256_bswap_64( *(buf+12) ); \
M[0xD] = mm256_byteswap_64( *(buf+13) ); \ M[0xD] = mm256_bswap_64( *(buf+13) ); \
M[0xE] = mm256_byteswap_64( *(buf+14) ); \ M[0xE] = mm256_bswap_64( *(buf+14) ); \
M[0xF] = mm256_byteswap_64( *(buf+15) ); \ M[0xF] = mm256_bswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \ for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \ ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \ H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -913,22 +913,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_byteswap_64( *(buf + 0) ); \ M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_byteswap_64( *(buf + 1) ); \ M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_byteswap_64( *(buf + 2) ); \ M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_byteswap_64( *(buf + 3) ); \ M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_byteswap_64( *(buf + 4) ); \ M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_byteswap_64( *(buf + 5) ); \ M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_byteswap_64( *(buf + 6) ); \ M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_byteswap_64( *(buf + 7) ); \ M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_byteswap_64( *(buf + 8) ); \ M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_byteswap_64( *(buf + 9) ); \ M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_byteswap_64( *(buf + 10) ); \ MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_byteswap_64( *(buf + 11) ); \ MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_byteswap_64( *(buf + 12) ); \ MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_byteswap_64( *(buf + 13) ); \ MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_byteswap_64( *(buf + 14) ); \ ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_byteswap_64( *(buf + 15) ); \ MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \ ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \ ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \ ROUND_B_4WAY(2); \
@@ -1064,8 +1064,8 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if (out_size_w32 == 8) if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2], u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set1_epi32( 0x01000000UL ) ); _mm_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) ); *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) ); *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr ); blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
} }
else else
@@ -1077,13 +1077,13 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
memset_zero_128( u.buf, 56>>2 ); memset_zero_128( u.buf, 56>>2 );
if (out_size_w32 == 8) if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL ); u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) ); *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) ); *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 ); blake32_4way( sc, u.buf, 64 );
} }
out = (__m128i*)dst; out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ ) for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_32( sc->H[k] ); out[k] = mm_bswap_32( sc->H[k] );
} }
#if defined (__AVX2__) #if defined (__AVX2__)
@@ -1187,8 +1187,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
if (out_size_w32 == 8) if (out_size_w32 == 8)
u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2], u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2],
_mm256_set1_epi32( 0x01000000UL ) ); _mm256_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) ); *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) ); *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr ); blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
} }
else else
@@ -1200,13 +1200,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
memset_zero_256( u.buf, 56>>2 ); memset_zero_256( u.buf, 56>>2 );
if (out_size_w32 == 8) if (out_size_w32 == 8)
u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL ); u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) ); *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) ); *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, u.buf, 64 ); blake32_8way( sc, u.buf, 64 );
} }
out = (__m256i*)dst; out = (__m256i*)dst;
for ( k = 0; k < out_size_w32; k++ ) for ( k = 0; k < out_size_w32; k++ )
out[k] = mm256_byteswap_32( sc->H[k] ); out[k] = mm256_bswap_32( sc->H[k] );
} }
// Blake-512 4 way // Blake-512 4 way
@@ -1311,9 +1311,9 @@ blake64_4way_close( blake_4way_big_context *sc,
if ( out_size_w64 == 8 ) if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)], u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) ); _mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(u.buf+(112>>3)) = mm256_byteswap_64( *(u.buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) ); _mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64( *(u.buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) ); _mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr ); blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
@@ -1328,16 +1328,16 @@ blake64_4way_close( blake_4way_big_context *sc,
memset_zero_256( u.buf, 112>>3 ); memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 ) if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL ); u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(u.buf+(112>>3)) = mm256_byteswap_64( *(u.buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) ); _mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64( *(u.buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) ); _mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 ); blake64_4way( sc, u.buf, 128 );
} }
out = (__m256i*)dst; out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ ) for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_byteswap_64( sc->H[k] ); out[k] = mm256_bswap_64( sc->H[k] );
} }
#endif #endif

View File

@@ -51,7 +51,9 @@ extern "C"{
// BMW small has a bug not present in big. Lanes 0 & 2 produce valid hash // BMW small has a bug not present in big. Lanes 0 & 2 produce valid hash
// while lanes 1 & 3 produce invalid hash. The cause is not known. // while lanes 1 & 3 produce invalid hash. The cause is not known.
// Some things that could cause it are: using epi64 instead of epi32,
// a memory write that is the wrong size, an attempt to index a vector
// like an array (only works for 64 bit elements).
static const sph_u32 IV256[] = { static const sph_u32 IV256[] = {
@@ -984,7 +986,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
} }
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 ); memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n ); buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = mm_zero; buf[ (buf_size - 4) >> 2 ] = m128_zero;
compress_small( buf, h, h2 ); compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ ) for ( u = 0; u < 16; u ++ )

View File

@@ -129,7 +129,7 @@ static void transform( cubehashParam *sp )
#endif #endif
} // transform } // transform
// Ccubehash context initializing is very expensive. // Cubehash context initializing is very expensive.
// Cache the intial value for faster reinitializing. // Cache the intial value for faster reinitializing.
cubehashParam cube_ctx_cache __attribute__ ((aligned (64))); cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));

View File

@@ -20,11 +20,11 @@ typedef struct {
#else #else
hashState_groestl groestl; hashState_groestl groestl;
#endif #endif
#ifndef USE_SPH_SHA //#ifndef USE_SPH_SHA
SHA256_CTX sha; // SHA256_CTX sha;
#else //#else
sph_sha256_context sha; sph_sha256_context sha;
#endif //#endif
} myrgr_ctx_holder; } myrgr_ctx_holder;
myrgr_ctx_holder myrgr_ctx; myrgr_ctx_holder myrgr_ctx;
@@ -36,11 +36,11 @@ void init_myrgr_ctx()
#else #else
init_groestl (&myrgr_ctx.groestl, 64 ); init_groestl (&myrgr_ctx.groestl, 64 );
#endif #endif
#ifndef USE_SPH_SHA //#ifndef USE_SPH_SHA
SHA256_Init( &myrgr_ctx.sha ); // SHA256_Init( &myrgr_ctx.sha );
#else //#else
sph_sha256_init( &myrgr_ctx.sha ); sph_sha256_init( &myrgr_ctx.sha );
#endif //#endif
} }
void myriadhash( void *output, const void *input ) void myriadhash( void *output, const void *input )
@@ -57,13 +57,13 @@ void myriadhash( void *output, const void *input )
(const char*)input, 640 ); (const char*)input, 640 );
#endif #endif
#ifndef USE_SPH_SHA //#ifndef USE_SPH_SHA
SHA256_Update( &ctx.sha, hash, 64 ); // SHA256_Update( &ctx.sha, hash, 64 );
SHA256_Final( (unsigned char*) hash, &ctx.sha ); // SHA256_Final( (unsigned char*) hash, &ctx.sha );
#else //#else
sph_sha256(&ctx.sha, hash, 64); sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash); sph_sha256_close(&ctx.sha, hash);
#endif //#endif
memcpy(output, hash, 32); memcpy(output, hash, 32);
} }

File diff suppressed because it is too large Load Diff

View File

@@ -48,20 +48,20 @@ extern "C"{
#define SPH_SIZE_hamsi512 512 #define SPH_SIZE_hamsi512 512
// Partial is only scalar but needs pointer ref for hamsi-helper
// deprecate partial_len
typedef struct { typedef struct {
__m128i h[16]; __m256i h[8];
__m128i partial[2]; __m256i buf[1];
size_t partial_len; size_t partial_len;
sph_u32 count_high, count_low; sph_u32 count_high, count_low;
} hamsi_4way_big_context; } hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context; typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init(void *cc); void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
void hamsi512_4way(void *cc, const void *data, size_t len); void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
void hamsi512_4way_close(void *cc, void *dst);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@@ -1,482 +0,0 @@
/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */
/*
* Helper code for Hamsi (input block expansion). This code is
* automatically generated and includes precomputed tables for
* expansion code which handles 2 to 8 bits at a time.
*
* This file is included from hamsi.c, and is not meant to be compiled
* independently.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef __cplusplus
extern "C"{
#endif
/* Note: this table lists bits within each byte from least
siginificant to most significant. */
static const sph_u32 T512[64][16] = {
{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
SPH_C32(0x9e69af68) },
{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
SPH_C32(0x0c26f262) },
{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
SPH_C32(0xdc24e61f) },
{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
SPH_C32(0x3daac2da) },
{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
SPH_C32(0x78cace29) },
{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
SPH_C32(0x2dd1f9ab) },
{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
SPH_C32(0xbf2c0be2) },
{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
SPH_C32(0x32219526) },
{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
SPH_C32(0xac8e6c88) },
{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
SPH_C32(0x7b1bd6b9) },
{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
SPH_C32(0xf746c320) },
{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
SPH_C32(0x69505b3a) },
{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
SPH_C32(0x8a341574) },
{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
SPH_C32(0x450360bf) },
{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
SPH_C32(0xf3d45758) },
{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
SPH_C32(0x925c44e9) },
{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
SPH_C32(0xa123ff9f) },
{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
SPH_C32(0x1568ff0f) },
{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
SPH_C32(0xc5c1eb3e) },
{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
SPH_C32(0x1af21fe1) },
{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
SPH_C32(0x857f3c2b) },
{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
SPH_C32(0x2ba05a55) },
{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
SPH_C32(0xfeabf254) },
{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
SPH_C32(0xfe1cdc7f) },
{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
SPH_C32(0xb0a51834) },
{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
SPH_C32(0xa6b8c28d) },
{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
SPH_C32(0x3a4e99d7) },
{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
SPH_C32(0xe1844257) },
{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
SPH_C32(0x2c3b504e) },
{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
SPH_C32(0x524a0d59) },
{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
SPH_C32(0x378dd173) },
{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
SPH_C32(0x8b6c72bd) },
{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
SPH_C32(0x8e67b7fa) },
{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
SPH_C32(0x443d3004) },
{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
SPH_C32(0xf4f6ea7b) },
{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
SPH_C32(0x979961d0) },
{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
SPH_C32(0x98aa496e) },
{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
SPH_C32(0x094e3198) },
{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
SPH_C32(0xe86cba2e) },
{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
SPH_C32(0x4b7eec55) },
{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
SPH_C32(0x1e7536a6) },
{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
SPH_C32(0x24314f17) },
{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
SPH_C32(0x9075b1ce) },
{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
SPH_C32(0x9b6ef888) },
{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
SPH_C32(0xd8b61463) },
{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
SPH_C32(0x3ea660f7) },
{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
SPH_C32(0x7f975691) },
{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
SPH_C32(0x2c94459e) },
{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
SPH_C32(0x56a7b19f) },
{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
SPH_C32(0x81fdf908) },
{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
SPH_C32(0x5bd61539) },
{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
SPH_C32(0x15b961e7) },
{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
SPH_C32(0x2a2c18f0) },
{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
SPH_C32(0x551e3d6e) },
{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
SPH_C32(0x33c5244f) },
{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
SPH_C32(0x8a58e6a4) },
{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
SPH_C32(0xda878000) },
{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
SPH_C32(0x3c5dfffe) },
{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
SPH_C32(0x7b1675d7) },
{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
SPH_C32(0x2879ebac) },
{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
SPH_C32(0xbe0a679e) },
{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
SPH_C32(0x30aebcf7) },
{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
SPH_C32(0xc7ff60f0) },
{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
SPH_C32(0xe7e00a94) }
};
/* Message-expansion step for one 32-bit word of the input block.
 * buf[n] is a __m128i holding that word in each of its four 32-bit
 * lanes.  For each of the word's 32 bits a full-width mask dm is built
 * ( all-ones in a lane where the bit is set, all-zeros otherwise ) and
 * the matching 16-word row of the T512 linear-code table is
 * conditionally XORed into the bitsliced state m0..mF.
 * Expects tp ( a cursor into T512 ), buf and m0..mF in scope at the
 * expansion site.
 * Fix: dropped the trailing ';' after "while (0)" — it defeated the
 * do/while(0) idiom by leaving a stray empty statement at every
 * expansion ( would break "if (x) U_BIG(0); else ..." ). */
#define U_BIG( n ) \
do { \
   __m128i db = buf[n]; \
   for ( int u = 0; u < 32; u++ ) \
   { \
      /* dm = per-lane 0xffffffff if low bit of db set, else 0 */ \
      __m128i dm = mm_negate_32( _mm_and_si128( db, mm_one_32 ) ); \
      m0 = _mm_xor_si128( m0, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m1 = _mm_xor_si128( m1, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m2 = _mm_xor_si128( m2, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m3 = _mm_xor_si128( m3, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m4 = _mm_xor_si128( m4, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m5 = _mm_xor_si128( m5, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m6 = _mm_xor_si128( m6, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m7 = _mm_xor_si128( m7, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m8 = _mm_xor_si128( m8, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      m9 = _mm_xor_si128( m9, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mA = _mm_xor_si128( mA, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mB = _mm_xor_si128( mB, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mC = _mm_xor_si128( mC, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mD = _mm_xor_si128( mD, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mE = _mm_xor_si128( mE, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      mF = _mm_xor_si128( mF, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
      /* move the next message bit into the low position */ \
      db = _mm_srli_epi32( db, 1 ); \
   } \
} while (0)
#define INPUT_BIG \
do { \
const sph_u32 *tp = &T512[0][0]; \
m0 = mm_zero; \
m1 = mm_zero; \
m2 = mm_zero; \
m3 = mm_zero; \
m4 = mm_zero; \
m5 = mm_zero; \
m6 = mm_zero; \
m7 = mm_zero; \
m8 = mm_zero; \
m9 = mm_zero; \
mA = mm_zero; \
mB = mm_zero; \
mC = mm_zero; \
mD = mm_zero; \
mE = mm_zero; \
mF = mm_zero; \
U_BIG( 0 ); \
U_BIG( 1 ); \
} while (0)
#ifdef __cplusplus
}
#endif

940
algo/hamsi/sph_hamsi.c.test Normal file
View File

@@ -0,0 +1,940 @@
/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
/*
* Hamsi implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_hamsi.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAMSI
#define SPH_SMALL_FOOTPRINT_HAMSI 1
#endif
/*
* The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
* table lookup during message expansion (1 to 8, inclusive). If we note
* w the number of bits per message word (w=32 for Hamsi-224/256, w=64
* for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
* Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
* then we will get t tables (where t=ceil(w/n)) of individual size
* 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
* n=5, there are 7 tables, but the last one uses only two bits on
* input, not five).
*
* Also, we read t rows of r words from RAM. Words in a given row are
* concatenated in RAM in that order, so most of the cost is about
* reading the first row word; comparatively, cache misses are thus
* less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
*
* When n=1, tables are "special" in that we omit the first entry of
* each table (which always contains 0), so that total table size is
* halved.
*
* We thus have the following (size1 is the cumulative table size of
* Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
* are for Hamsi-224/256 and Hamsi-384/512, respectively).
*
* n size1 size2 t1 t2
* ---------------------------------------
* 1 1024 4096 32 64
* 2 2048 8192 16 32
* 3 2688 10880 11 22
* 4 4096 16384 8 16
* 5 6272 25600 7 13
* 6 10368 41984 6 11
* 7 16896 73856 5 10
* 8 32768 131072 4 8
*
* So there is a trade-off: a lower n makes the tables fit better in
* L1 cache, but increases the number of memory accesses. The optimal
* value depends on the amount of available L1 cache and the relative
* impact of a cache miss.
*
* Experimentally, in ideal benchmark conditions (which are not necessarily
* realistic with regards to L1 cache contention), it seems that n=8 is
* the best value on "big" architectures (those with 32 kB or more of L1
* cache), while n=4 is better on "small" architectures. This was tested
* on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
* (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
* (8 kB L1 cache).
*
* Note: with n=1, the 32 tables (actually implemented as one big table)
* are read entirely and sequentially, regardless of the input data,
* thus avoiding any data-dependent table access pattern.
*/
/* Default expansion levels ( input bits per table lookup ) — see the
   size/speed trade-off table in the comment above. */
#if !defined SPH_HAMSI_EXPAND_SMALL
#if SPH_SMALL_FOOTPRINT_HAMSI
#define SPH_HAMSI_EXPAND_SMALL 4
#else
#define SPH_HAMSI_EXPAND_SMALL 8
#endif
#endif
#if !defined SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 8
#endif
/* MSVC: silence "unary minus applied to unsigned" ( used deliberately
   to build bit masks ). */
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/* Message-expansion tables and the INPUT_SMALL / INPUT_BIG macros. */
#include "sph_hamsi_helper.c"
/* Hamsi-224 initial chaining value ( 8 x 32 bits ). */
static const sph_u32 IV224[] = {
SPH_C32(0xc3967a67), SPH_C32(0xc3bc6c20), SPH_C32(0x4bc3bcc3),
SPH_C32(0xa7c3bc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
SPH_C32(0x69656b65), SPH_C32(0x20556e69)
};
/*
 * This version is the one used in the Hamsi submission package for
 * round 2 of the SHA-3 competition; the UTF-8 encoding is wrong and
 * shall soon be corrected in the official Hamsi specification.
 *
static const sph_u32 IV224[] = {
SPH_C32(0x3c967a67), SPH_C32(0x3cbc6c20), SPH_C32(0xb4c343c3),
SPH_C32(0xa73cbc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
SPH_C32(0x69656b65), SPH_C32(0x20556e69)
};
*/
/* Hamsi-256 initial chaining value ( 8 x 32 bits ). */
static const sph_u32 IV256[] = {
SPH_C32(0x76657273), SPH_C32(0x69746569), SPH_C32(0x74204c65),
SPH_C32(0x7576656e), SPH_C32(0x2c204465), SPH_C32(0x70617274),
SPH_C32(0x656d656e), SPH_C32(0x7420456c)
};
/* Hamsi-384 initial chaining value ( 16 x 32 bits ). */
static const sph_u32 IV384[] = {
SPH_C32(0x656b7472), SPH_C32(0x6f746563), SPH_C32(0x686e6965),
SPH_C32(0x6b2c2043), SPH_C32(0x6f6d7075), SPH_C32(0x74657220),
SPH_C32(0x53656375), SPH_C32(0x72697479), SPH_C32(0x20616e64),
SPH_C32(0x20496e64), SPH_C32(0x75737472), SPH_C32(0x69616c20),
SPH_C32(0x43727970), SPH_C32(0x746f6772), SPH_C32(0x61706879),
SPH_C32(0x2c204b61)
};
/* Hamsi-512 initial chaining value ( 16 x 32 bits ). */
static const sph_u32 IV512[] = {
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
SPH_C32(0x6769756d)
};
/* Round constants for the non-final rounds ( consumed by P_SMALL /
   P_BIG through ROUND_* ). */
static const sph_u32 alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
};
/* Round constants for the final rounds ( consumed by PF_SMALL /
   PF_BIG through ROUND_* ). */
static const sph_u32 alpha_f[] = {
SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
};
/* Hamsi-224/256 chaining state: eight 32-bit words held in locals. */
#define DECL_STATE_SMALL \
sph_u32 c0, c1, c2, c3, c4, c5, c6, c7;
/* Load the chaining state from the context into the c* locals. */
#define READ_STATE_SMALL(sc) do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
} while (0)
/* Store the c* locals back into the context. */
#define WRITE_STATE_SMALL(sc) do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
} while (0)
/* Hamsi-256 state layout: the 16 logical words s0..sF interleave the
   eight expanded message words ( m0..m7 ) with the eight chaining
   words ( c0..c7 ). */
#define s0 m0
#define s1 m1
#define s2 c0
#define s3 c1
#define s4 c2
#define s5 c3
#define s6 m2
#define s7 m3
#define s8 m4
#define s9 m5
#define sA c4
#define sB c5
#define sC c6
#define sD c7
#define sE m6
#define sF m7
#define SBOX(a, b, c, d) do { \
sph_u32 t; \
t = (a); \
(a) &= (c); \
(a) ^= (d); \
(c) ^= (b); \
(c) ^= (a); \
(d) |= t; \
(d) ^= (b); \
t ^= (c); \
(b) = (d); \
(d) |= t; \
(d) ^= (a); \
(a) &= (b); \
t ^= (a); \
(b) ^= (d); \
(b) ^= t; \
(a) = (c); \
(c) = (b); \
(b) = (d); \
(d) = SPH_T32(~t); \
} while (0)
/* Linear diffusion layer: mixes four state words with rotations,
   shifts and XORs ( in place ). */
#define L(a, b, c, d) do { \
(a) = SPH_ROTL32(a, 13); \
(c) = SPH_ROTL32(c, 3); \
(b) ^= (a) ^ (c); \
(d) ^= (c) ^ SPH_T32((a) << 3); \
(b) = SPH_ROTL32(b, 1); \
(d) = SPH_ROTL32(d, 7); \
(a) ^= (b) ^ (d); \
(c) ^= (d) ^ SPH_T32((b) << 7); \
(a) = SPH_ROTL32(a, 5); \
(c) = SPH_ROTL32(c, 22); \
} while (0)
/* One round of the Hamsi-256 permutation: XOR round constants into the
   state ( the round counter rc is folded into word s1 ), apply the
   bitsliced S-box to the four columns, then the linear layer L to the
   four diagonals. */
#define ROUND_SMALL(rc, alpha) do { \
s0 ^= alpha[0x00]; \
s1 ^= alpha[0x01] ^ (sph_u32)(rc); \
s2 ^= alpha[0x02]; \
s3 ^= alpha[0x03]; \
s4 ^= alpha[0x08]; \
s5 ^= alpha[0x09]; \
s6 ^= alpha[0x0A]; \
s7 ^= alpha[0x0B]; \
s8 ^= alpha[0x10]; \
s9 ^= alpha[0x11]; \
sA ^= alpha[0x12]; \
sB ^= alpha[0x13]; \
sC ^= alpha[0x18]; \
sD ^= alpha[0x19]; \
sE ^= alpha[0x1A]; \
sF ^= alpha[0x1B]; \
SBOX(s0, s4, s8, sC); \
SBOX(s1, s5, s9, sD); \
SBOX(s2, s6, sA, sE); \
SBOX(s3, s7, sB, sF); \
L(s0, s5, sA, sF); \
L(s1, s6, sB, sC); \
L(s2, s7, s8, sD); \
L(s3, s4, s9, sE); \
} while (0)
/* Non-final permutation: three rounds with the alpha_n constants. */
#define P_SMALL do { \
ROUND_SMALL(0, alpha_n); \
ROUND_SMALL(1, alpha_n); \
ROUND_SMALL(2, alpha_n); \
} while (0)
/* Final permutation: six rounds with the alpha_f constants. */
#define PF_SMALL do { \
ROUND_SMALL(0, alpha_f); \
ROUND_SMALL(1, alpha_f); \
ROUND_SMALL(2, alpha_f); \
ROUND_SMALL(3, alpha_f); \
ROUND_SMALL(4, alpha_f); \
ROUND_SMALL(5, alpha_f); \
} while (0)
/* Feed-forward / truncation: XOR eight of the sixteen state words back
   into the context chaining value.  The descending order matters
   because the s* names alias the c* locals being reassigned. */
#define T_SMALL do { \
/* order is important */ \
c7 = (sc->h[7] ^= sB); \
c6 = (sc->h[6] ^= sA); \
c5 = (sc->h[5] ^= s9); \
c4 = (sc->h[4] ^= s8); \
c3 = (sc->h[3] ^= s3); \
c2 = (sc->h[2] ^= s2); \
c1 = (sc->h[1] ^= s1); \
c0 = (sc->h[0] ^= s0); \
} while (0)
/* Absorb num consecutive 4-byte blocks from buf into the Hamsi-224/256
   chaining state, advancing the processed-bit counter by num * 32.
   The m0..m7 names are required by the INPUT_SMALL macro. */
static void
hamsi_small(sph_hamsi_small_context *sc, const unsigned char *buf, size_t num)
{
DECL_STATE_SMALL
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->count += (sph_u64)num << 5;
#else
tmp = SPH_T32((sph_u32)num << 5);
sc->count_low = SPH_T32(sc->count_low + tmp);
/* high word of the 64-bit bit count: ( num * 32 ) >> 32 == num >> 27 */
sc->count_high += (sph_u32)((num >> 13) >> 14);
if (sc->count_low < tmp)
sc->count_high ++;
#endif
READ_STATE_SMALL(sc);
while (num -- > 0) {
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
/* expand block, permute, feed forward */
INPUT_SMALL;
P_SMALL;
T_SMALL;
buf += 4;
}
WRITE_STATE_SMALL(sc);
}
/* Absorb the last ( padded ) 4-byte block using the six-round final
   permutation.  The bit counter is not advanced here. */
static void
hamsi_small_final(sph_hamsi_small_context *sc, const unsigned char *buf)
{
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_SMALL
READ_STATE_SMALL(sc);
INPUT_SMALL;
PF_SMALL;
T_SMALL;
WRITE_STATE_SMALL(sc);
}
/* Reset the context: load the chaining value from iv and clear the
   partial-block buffer and the bit counter. */
static void
hamsi_small_init(sph_hamsi_small_context *sc, const sph_u32 *iv)
{
sc->partial_len = 0;
memcpy(sc->h, iv, sizeof sc->h);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
/*
 * Buffered absorb for Hamsi-224/256: feed len bytes of data into the
 * state.  Bytes that do not fill a whole 4-byte block are kept in
 * sc->partial until the next call ( or the final close ).
 */
static void
hamsi_small_core(sph_hamsi_small_context *sc, const void *data, size_t len)
{
	const unsigned char *p = data;

	if (sc->partial_len != 0) {
		/* Top up the pending partial block first. */
		size_t need = 4 - sc->partial_len;

		if (len < need) {
			memcpy(sc->partial + sc->partial_len, p, len);
			sc->partial_len += len;
			return;
		}
		memcpy(sc->partial + sc->partial_len, p, need);
		p += need;
		len -= need;
		hamsi_small(sc, sc->partial, 1);
		sc->partial_len = 0;
	}
	/* Bulk-process whole 4-byte blocks, then stash the tail bytes. */
	hamsi_small(sc, p, len >> 2);
	p += len & ~(size_t)3;
	len &= (size_t)3;
	memcpy(sc->partial, p, len);
	sc->partial_len = len;
}
/* Finalize: append the padding ( a 1 bit after the n valid bits of ub,
   then zeroes ) and the big-endian 64-bit message length in bits, push
   pad[0..7] through the normal rounds and pad[8..11] through the final
   permutation, then write out_size_w32 big-endian state words to dst.
   ub/n: last partial byte and its valid bit count ( 0..7 ). */
static void
hamsi_small_close(sph_hamsi_small_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
{
unsigned char pad[12];
size_t ptr, u;
unsigned z;
unsigned char *out;
ptr = sc->partial_len;
memcpy(pad, sc->partial, ptr);
/* pad[4..11]: total message length in bits, written before the
   padding byte so the count below is not affected by the extra
   blocks processed next */
#if SPH_64
sph_enc64be(pad + 4, sc->count + (ptr << 3) + n);
#else
sph_enc32be(pad + 4, sc->count_high);
sph_enc32be(pad + 8, sc->count_low + (ptr << 3) + n);
#endif
/* set the first padding bit just after the n message bits of ub */
z = 0x80 >> n;
pad[ptr ++] = ((ub & -z) | z) & 0xFF;
while (ptr < 4)
pad[ptr ++] = 0;
hamsi_small(sc, pad, 2);
hamsi_small_final(sc, pad + 8);
out = dst;
for (u = 0; u < out_size_w32; u ++)
sph_enc32be(out + (u << 2), sc->h[u]);
}
/* Hamsi-384/512 chaining state: sixteen 32-bit words held in locals. */
#define DECL_STATE_BIG \
sph_u32 c0, c1, c2, c3, c4, c5, c6, c7; \
sph_u32 c8, c9, cA, cB, cC, cD, cE, cF;
/* Load the chaining state from the context into the c* locals. */
#define READ_STATE_BIG(sc) do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c8 = sc->h[0x8]; \
c9 = sc->h[0x9]; \
cA = sc->h[0xA]; \
cB = sc->h[0xB]; \
cC = sc->h[0xC]; \
cD = sc->h[0xD]; \
cE = sc->h[0xE]; \
cF = sc->h[0xF]; \
} while (0)
/* Store the c* locals back into the context. */
#define WRITE_STATE_BIG(sc) do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0x8] = c8; \
sc->h[0x9] = c9; \
sc->h[0xA] = cA; \
sc->h[0xB] = cB; \
sc->h[0xC] = cC; \
sc->h[0xD] = cD; \
sc->h[0xE] = cE; \
sc->h[0xF] = cF; \
} while (0)
/* Hamsi-512 state layout: the 32 logical words s00..s1F interleave the
   sixteen expanded message words ( m0..mF ) with the sixteen chaining
   words ( c0..cF ). */
#define s00 m0
#define s01 m1
#define s02 c0
#define s03 c1
#define s04 m2
#define s05 m3
#define s06 c2
#define s07 c3
#define s08 c4
#define s09 c5
#define s0A m4
#define s0B m5
#define s0C c6
#define s0D c7
#define s0E m6
#define s0F m7
#define s10 m8
#define s11 m9
#define s12 c8
#define s13 c9
#define s14 mA
#define s15 mB
#define s16 cA
#define s17 cB
#define s18 cC
#define s19 cD
#define s1A mC
#define s1B mD
#define s1C cE
#define s1D cF
#define s1E mE
#define s1F mF
/* One round of the Hamsi-512 permutation: XOR the 32 alpha round
 * constants into the state ( the round counter rc is folded into word
 * s01 ), apply the bitsliced S-box to the eight columns, then the
 * linear layer L to the eight diagonals plus four extra word groups.
 * Cleanup: removed commented-out debug printf blocks that were left
 * inside the macro. */
#define ROUND_BIG(rc, alpha) do { \
s00 ^= alpha[0x00]; \
s01 ^= alpha[0x01] ^ (sph_u32)(rc); \
s02 ^= alpha[0x02]; \
s03 ^= alpha[0x03]; \
s04 ^= alpha[0x04]; \
s05 ^= alpha[0x05]; \
s06 ^= alpha[0x06]; \
s07 ^= alpha[0x07]; \
s08 ^= alpha[0x08]; \
s09 ^= alpha[0x09]; \
s0A ^= alpha[0x0A]; \
s0B ^= alpha[0x0B]; \
s0C ^= alpha[0x0C]; \
s0D ^= alpha[0x0D]; \
s0E ^= alpha[0x0E]; \
s0F ^= alpha[0x0F]; \
s10 ^= alpha[0x10]; \
s11 ^= alpha[0x11]; \
s12 ^= alpha[0x12]; \
s13 ^= alpha[0x13]; \
s14 ^= alpha[0x14]; \
s15 ^= alpha[0x15]; \
s16 ^= alpha[0x16]; \
s17 ^= alpha[0x17]; \
s18 ^= alpha[0x18]; \
s19 ^= alpha[0x19]; \
s1A ^= alpha[0x1A]; \
s1B ^= alpha[0x1B]; \
s1C ^= alpha[0x1C]; \
s1D ^= alpha[0x1D]; \
s1E ^= alpha[0x1E]; \
s1F ^= alpha[0x1F]; \
SBOX(s00, s08, s10, s18); \
SBOX(s01, s09, s11, s19); \
SBOX(s02, s0A, s12, s1A); \
SBOX(s03, s0B, s13, s1B); \
SBOX(s04, s0C, s14, s1C); \
SBOX(s05, s0D, s15, s1D); \
SBOX(s06, s0E, s16, s1E); \
SBOX(s07, s0F, s17, s1F); \
L(s00, s09, s12, s1B); \
L(s01, s0A, s13, s1C); \
L(s02, s0B, s14, s1D); \
L(s03, s0C, s15, s1E); \
L(s04, s0D, s16, s1F); \
L(s05, s0E, s17, s18); \
L(s06, s0F, s10, s19); \
L(s07, s08, s11, s1A); \
L(s00, s02, s05, s07); \
L(s10, s13, s15, s16); \
L(s09, s0B, s0C, s0E); \
L(s19, s1A, s1C, s1F); \
} while (0)
#if SPH_SMALL_FOOTPRINT_HAMSI

/* Non-final permutation: six rounds with the alpha_n constants. */
#define P_BIG do { \
unsigned r; \
for (r = 0; r < 6; r ++) \
ROUND_BIG(r, alpha_n); \
} while (0)

/* Final permutation: twelve rounds with the alpha_f constants. */
#define PF_BIG do { \
unsigned r; \
for (r = 0; r < 12; r ++) \
ROUND_BIG(r, alpha_f); \
} while (0)

#else

/* Non-final permutation: six unrolled rounds with alpha_n.
 * Cleanup: removed a commented-out debug printf block that sat between
 * rounds 0 and 1. */
#define P_BIG do { \
ROUND_BIG(0, alpha_n); \
ROUND_BIG(1, alpha_n); \
ROUND_BIG(2, alpha_n); \
ROUND_BIG(3, alpha_n); \
ROUND_BIG(4, alpha_n); \
ROUND_BIG(5, alpha_n); \
} while (0)

/* Final permutation: twelve unrolled rounds with alpha_f. */
#define PF_BIG do { \
ROUND_BIG(0, alpha_f); \
ROUND_BIG(1, alpha_f); \
ROUND_BIG(2, alpha_f); \
ROUND_BIG(3, alpha_f); \
ROUND_BIG(4, alpha_f); \
ROUND_BIG(5, alpha_f); \
ROUND_BIG(6, alpha_f); \
ROUND_BIG(7, alpha_f); \
ROUND_BIG(8, alpha_f); \
ROUND_BIG(9, alpha_f); \
ROUND_BIG(10, alpha_f); \
ROUND_BIG(11, alpha_f); \
} while (0)

#endif
/* Feed-forward / truncation for Hamsi-512: XOR sixteen of the 32 state
   words ( s00..s07 and s10..s17 ) back into the context chaining
   value.  The descending order matters because the s* names alias the
   c* locals being reassigned. */
#define T_BIG do { \
/* order is important */ \
cF = (sc->h[0xF] ^= s17); \
cE = (sc->h[0xE] ^= s16); \
cD = (sc->h[0xD] ^= s15); \
cC = (sc->h[0xC] ^= s14); \
cB = (sc->h[0xB] ^= s13); \
cA = (sc->h[0xA] ^= s12); \
c9 = (sc->h[0x9] ^= s11); \
c8 = (sc->h[0x8] ^= s10); \
c7 = (sc->h[0x7] ^= s07); \
c6 = (sc->h[0x6] ^= s06); \
c5 = (sc->h[0x5] ^= s05); \
c4 = (sc->h[0x4] ^= s04); \
c3 = (sc->h[0x3] ^= s03); \
c2 = (sc->h[0x2] ^= s02); \
c1 = (sc->h[0x1] ^= s01); \
c0 = (sc->h[0x0] ^= s00); \
} while (0)
/* Absorb num consecutive 8-byte blocks from buf into the Hamsi-384/512
 * chaining state, advancing the processed-bit counter by num * 64.
 * The m0..mF names are required by the INPUT_BIG macro.
 * Cleanup: removed three commented-out debug printf blocks from the
 * absorb loop. */
static void
hamsi_big(sph_hamsi_big_context *sc, const unsigned char *buf, size_t num)
{
DECL_STATE_BIG
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->count += (sph_u64)num << 6;
#else
tmp = SPH_T32((sph_u32)num << 6);
sc->count_low = SPH_T32(sc->count_low + tmp);
/* high word of the 64-bit bit count: ( num * 64 ) >> 32 == num >> 26 */
sc->count_high += (sph_u32)((num >> 13) >> 13);
if (sc->count_low < tmp)
sc->count_high ++;
#endif
READ_STATE_BIG(sc);
while (num -- > 0) {
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
/* expand block, permute, feed forward */
INPUT_BIG;
P_BIG;
T_BIG;
buf += 8;
}
WRITE_STATE_BIG(sc);
}
/*
 * Final compression for the big state: process one last 8-byte block
 * (the encoded bit-length padding) using the 12-round permutation
 * PF_BIG instead of the 6-round P_BIG.  Note the bit counter is
 * deliberately NOT advanced here.
 */
static void
hamsi_big_final(sph_hamsi_big_context *sc, const unsigned char *buf)
{
	sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
	sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
	DECL_STATE_BIG
	READ_STATE_BIG(sc);
	INPUT_BIG;
	PF_BIG;
	T_BIG;
	WRITE_STATE_BIG(sc);
}
/*
 * Reset a big-Hamsi (384/512) context: load the caller-supplied IV
 * into the chaining value and clear both the partial-block buffer
 * length and the message bit counter.
 */
static void
hamsi_big_init(sph_hamsi_big_context *sc, const sph_u32 *iv)
{
	memcpy(sc->h, iv, sizeof sc->h);
#if SPH_64
	sc->count = 0;
#else
	sc->count_low = 0;
	sc->count_high = 0;
#endif
	sc->partial_len = 0;
}
/*
 * Buffered update for the big (384/512) variants.  Input is consumed
 * in 8-byte blocks; up to 7 trailing bytes are stashed in sc->partial
 * until more data arrives or the hash is closed.
 *
 * Fix: removed the unused debug locals `d` and `h` (they were only
 * referenced from commented-out printf blocks and triggered unused-
 * variable warnings) along with the dead debug comments.
 */
static void
hamsi_big_core(sph_hamsi_big_context *sc, const void *data, size_t len)
{
	/* Complete a previously buffered partial block first. */
	if (sc->partial_len != 0) {
		size_t mlen;

		mlen = 8 - sc->partial_len;
		if (len < mlen) {
			/* Still not a full block: just accumulate. */
			memcpy(sc->partial + sc->partial_len, data, len);
			sc->partial_len += len;
			return;
		} else {
			memcpy(sc->partial + sc->partial_len, data, mlen);
			len -= mlen;
			data = (const unsigned char *)data + mlen;
			hamsi_big(sc, sc->partial, 1);
			sc->partial_len = 0;
		}
	}

	hamsi_big(sc, data, (len >> 3));
	/* Buffer any trailing bytes (< 8) for the next call/close. */
	data = (const unsigned char *)data + (len & ~(size_t)7);
	len &= (size_t)7;
	memcpy(sc->partial, data, len);
	sc->partial_len = len;
}
/*
 * Finalize a big-Hamsi hash: append n (< 8) extra bits from ub, the
 * 0x80-style padding bit and the 64-bit message bit length, run the
 * final 12-round permutation, then emit the digest big-endian.
 *
 * out_size_w32 is 12 for Hamsi-384 (which, per the specification,
 * outputs only words h[0,1,3,4,5,6,8,9,10,12,13,15]) or 16 for
 * Hamsi-512.  (Dead debug comments removed; logic unchanged.)
 */
static void
hamsi_big_close(sph_hamsi_big_context *sc,
	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
{
	unsigned char pad[8];
	size_t ptr, u;
	unsigned z;
	unsigned char *out;

	ptr = sc->partial_len;
	/* Encode the total bit length (including the extra n bits). */
#if SPH_64
	sph_enc64be(pad, sc->count + (ptr << 3) + n);
#else
	sph_enc32be(pad, sc->count_high);
	sph_enc32be(pad + 4, sc->count_low + (ptr << 3) + n);
#endif
	/* Place the single 1 padding bit just after the last data bit,
	 * keeping the ub bits above it. */
	z = 0x80 >> n;
	sc->partial[ptr ++] = ((ub & -z) | z) & 0xFF;
	while (ptr < 8)
		sc->partial[ptr ++] = 0;
	hamsi_big(sc, sc->partial, 1);
	hamsi_big_final(sc, pad);
	out = dst;
	if (out_size_w32 == 12) {
		sph_enc32be(out +  0, sc->h[ 0]);
		sph_enc32be(out +  4, sc->h[ 1]);
		sph_enc32be(out +  8, sc->h[ 3]);
		sph_enc32be(out + 12, sc->h[ 4]);
		sph_enc32be(out + 16, sc->h[ 5]);
		sph_enc32be(out + 20, sc->h[ 6]);
		sph_enc32be(out + 24, sc->h[ 8]);
		sph_enc32be(out + 28, sc->h[ 9]);
		sph_enc32be(out + 32, sc->h[10]);
		sph_enc32be(out + 36, sc->h[12]);
		sph_enc32be(out + 40, sc->h[13]);
		sph_enc32be(out + 44, sc->h[15]);
	} else {
		for (u = 0; u < 16; u ++)
			sph_enc32be(out + (u << 2), sc->h[u]);
	}
}
/* see sph_hamsi.h -- Hamsi-224 public API: init with the 224-bit IV. */
void
sph_hamsi224_init(void *cc)
{
	hamsi_small_init(cc, IV224);
}

/* see sph_hamsi.h -- absorb len bytes of message data. */
void
sph_hamsi224(void *cc, const void *data, size_t len)
{
	hamsi_small_core(cc, data, len);
}

/* see sph_hamsi.h -- close with no extra bits; 7 output words (224 bits).
 * Auto re-init intentionally disabled (miner reuses explicit init). */
void
sph_hamsi224_close(void *cc, void *dst)
{
	hamsi_small_close(cc, 0, 0, dst, 7);
// hamsi_small_init(cc, IV224);
}

/* see sph_hamsi.h -- close after appending n (< 8) extra bits from ub. */
void
sph_hamsi224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	hamsi_small_close(cc, ub, n, dst, 7);
// hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h -- Hamsi-256 public API: init with the 256-bit IV. */
void
sph_hamsi256_init(void *cc)
{
	hamsi_small_init(cc, IV256);
}

/* see sph_hamsi.h -- absorb len bytes of message data. */
void
sph_hamsi256(void *cc, const void *data, size_t len)
{
	hamsi_small_core(cc, data, len);
}

/* see sph_hamsi.h -- close with no extra bits; 8 output words (256 bits).
 * Auto re-init intentionally disabled (miner reuses explicit init). */
void
sph_hamsi256_close(void *cc, void *dst)
{
	hamsi_small_close(cc, 0, 0, dst, 8);
// hamsi_small_init(cc, IV256);
}

/* see sph_hamsi.h -- close after appending n (< 8) extra bits from ub. */
void
sph_hamsi256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	hamsi_small_close(cc, ub, n, dst, 8);
// hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h -- Hamsi-384 public API: init with the 384-bit IV. */
void
sph_hamsi384_init(void *cc)
{
	hamsi_big_init(cc, IV384);
}

/* see sph_hamsi.h -- absorb len bytes of message data. */
void
sph_hamsi384(void *cc, const void *data, size_t len)
{
	hamsi_big_core(cc, data, len);
}

/* see sph_hamsi.h -- close with no extra bits; 12 output words (384 bits).
 * Auto re-init intentionally disabled (miner reuses explicit init). */
void
sph_hamsi384_close(void *cc, void *dst)
{
	hamsi_big_close(cc, 0, 0, dst, 12);
// hamsi_big_init(cc, IV384);
}

/* see sph_hamsi.h -- close after appending n (< 8) extra bits from ub. */
void
sph_hamsi384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	hamsi_big_close(cc, ub, n, dst, 12);
// hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h -- Hamsi-512 public API: init with the 512-bit IV. */
void
sph_hamsi512_init(void *cc)
{
	hamsi_big_init(cc, IV512);
}

/* see sph_hamsi.h -- absorb len bytes of message data. */
void
sph_hamsi512(void *cc, const void *data, size_t len)
{
	hamsi_big_core(cc, data, len);
}

/* see sph_hamsi.h -- close with no extra bits; 16 output words (512 bits).
 * Auto re-init intentionally disabled (miner reuses explicit init). */
void
sph_hamsi512_close(void *cc, void *dst)
{
	hamsi_big_close(cc, 0, 0, dst, 16);
// hamsi_big_init(cc, IV512);
}

/* see sph_hamsi.h -- close after appending n (< 8) extra bits from ub. */
void
sph_hamsi512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	hamsi_big_close(cc, ub, n, dst, 16);
// hamsi_big_init(cc, IV512);
}
#ifdef __cplusplus
}
#endif

View File

@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
current = (unsigned)sc->count_low & 127UL; current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = mm_one_32; sc->buf[ current>>2 ] = m128_one_32;
current += 4; current += 4;
RSTATE; RSTATE;
if ( current > 116UL ) if ( current > 116UL )

View File

@@ -15,7 +15,7 @@
#include "algo/shabal/sph_shabal.h" #include "algo/shabal/sph_shabal.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h" #include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/skein/sse2/skein.c" #include "algo/skein/sse2/skein.c"
#ifndef NO_AES_NI #ifndef NO_AES_NI

View File

@@ -99,6 +99,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
pthread_barrier_wait( &hodl_barrier ); pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done ); return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
#endif #endif
return false;
} }
bool register_hodl_algo( algo_gate_t* gate ) bool register_hodl_algo( algo_gate_t* gate )

View File

@@ -44,7 +44,7 @@ void jha_hash_4way( void *out, const void *input )
for ( int round = 0; round < 3; round++ ) for ( int round = 0; round < 3; round++ )
{ {
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero ); vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 ); init_groestl( &ctx_groestl, 64 );

View File

@@ -59,7 +59,7 @@ static const sph_u64 RC[] = {
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) #define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1)) #define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n)) #define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64 #define XOR64_IOTA XOR64
@@ -375,12 +375,12 @@ static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
kc->w[i] = _mm256_setzero_si256(); kc->w[i] = _mm256_setzero_si256();
// Initialization for the "lane complement". // Initialization for the "lane complement".
kc->w[ 1] = mm256_neg1; kc->w[ 1] = m256_neg1;
kc->w[ 2] = mm256_neg1; kc->w[ 2] = m256_neg1;
kc->w[ 8] = mm256_neg1; kc->w[ 8] = m256_neg1;
kc->w[12] = mm256_neg1; kc->w[12] = m256_neg1;
kc->w[17] = mm256_neg1; kc->w[17] = m256_neg1;
kc->w[20] = mm256_neg1; kc->w[20] = m256_neg1;
kc->ptr = 0; kc->ptr = 0;
kc->lim = 200 - (out_size >> 2); kc->lim = 200 - (out_size >> 2);
} }

View File

@@ -0,0 +1,584 @@
/*
* luffa_for_sse2.c
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <string.h>
#include <immintrin.h>
#include "luffa-hash-2way.h"
#if defined(__AVX2__)
#include "avxdefs.h"
/* MASK selects the low 32-bit word of each 128-bit lane. */
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
                               0UL, 0UL, 0UL, 0xffffffffUL )

/* XOR the round constants (c0,c1) into the state pair (a,b).
 * NOTE(review): this macro ends with a trailing '\'; the blank line
 * below is what actually terminates it -- do not remove that line. */
#define ADD_CONSTANT(a,b,c0,c1)\
    a = _mm256_xor_si256(a,c0);\
    b = _mm256_xor_si256(b,c1);\

/* MULT2: update the chaining pair (a0,a1) with Luffa's word-ring
 * multiplication -- a 4-byte lane rotate combined with a conditional
 * XOR of the masked low word (presumably "multiply by 2" in the
 * Luffa ring, per the name -- confirm against the Luffa spec). */
#define MULT2(a0,a1) \
do { \
  __m256i b = _mm256_xor_si256( a0, \
        _mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
  a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
  a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
/* STEP_PART: one round of the wide permutation over the 8-vector state
 * x[0..7] with round-constant pair c and scratch t.  Pointer arithmetic
 * on x/c/t is equivalent to array indexing (x[0], x[1], ...). */
#define STEP_PART(x,c,t)\
    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
    MIXWORD(*x,*(x+4),*t,*(t+1));\
    MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
    MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
    MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
    ADD_CONSTANT(*x, *(x+4), *c, *(c+1));

/* SUBCRUMB: Luffa's 4-bit S-box applied bit-sliced across the four
 * vectors a0..a3, using t as scratch.  Statement order is significant.
 * NOTE(review): ends with a trailing '\'; the blank line below
 * terminates the macro -- do not remove it. */
#define SUBCRUMB(a0,a1,a2,a3,t)\
    t  = _mm256_load_si256(&a0);\
    a0 = _mm256_or_si256(a0,a1);\
    a2 = _mm256_xor_si256(a2,a3);\
    a1 = _mm256_andnot_si256(a1, m256_neg1 );\
    a0 = _mm256_xor_si256(a0,a3);\
    a3 = _mm256_and_si256(a3,t);\
    a1 = _mm256_xor_si256(a1,a3);\
    a3 = _mm256_xor_si256(a3,a2);\
    a2 = _mm256_and_si256(a2,a0);\
    a0 = _mm256_andnot_si256(a0, m256_neg1 );\
    a2 = _mm256_xor_si256(a2,a1);\
    a1 = _mm256_or_si256(a1,a3);\
    t  = _mm256_xor_si256(t,a1);\
    a3 = _mm256_xor_si256(a3,a2);\
    a2 = _mm256_and_si256(a2,a1);\
    a1 = _mm256_xor_si256(a1,a0);\
    a0 = _mm256_load_si256(&t);\

/* MIXWORD: the linear mixing layer between word a and word b, built
 * from XORs and fixed 32-bit rotations (2, 14, 10, 1 bits) expressed
 * as shift-pairs; t1/t2 are scratch. */
#define MIXWORD(a,b,t1,t2)\
    b  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(a,2);\
    t2 = _mm256_srli_epi32(a,30);\
    a  = _mm256_or_si256(t1,t2);\
    a  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(b,14);\
    t2 = _mm256_srli_epi32(b,18);\
    b  = _mm256_or_si256(t1,t2);\
    b  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(a,10);\
    t2 = _mm256_srli_epi32(a,22);\
    a  = _mm256_or_si256(t1,t2);\
    a  = _mm256_xor_si256(a,b);\
    t1 = _mm256_slli_epi32(b,1);\
    t2 = _mm256_srli_epi32(b,31);\
    b  = _mm256_or_si256(t1,t2);
/* STEP_PART2: one round over the fifth (last) 256-bit chaining pair
 * (a0,a1): shuffle into S-box order, apply SUBCRUMB and MIXWORD, then
 * add the round constants (c0,c1).  t0/t1 and tmp0/tmp1 are scratch. */
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
    a1 = _mm256_shuffle_epi32(a1,147);\
    t0 = _mm256_load_si256(&a1);\
    a1 = _mm256_unpacklo_epi32(a1,a0);\
    t0 = _mm256_unpackhi_epi32(t0,a0);\
    t1 = _mm256_shuffle_epi32(t0,78);\
    a0 = _mm256_shuffle_epi32(a1,78);\
    SUBCRUMB(t1,t0,a0,a1,tmp0);\
    t0 = _mm256_unpacklo_epi32(t0,t1);\
    a1 = _mm256_unpacklo_epi32(a1,a0);\
    a0 = _mm256_load_si256(&a1);\
    a0 = _mm256_unpackhi_epi64(a0,t0);\
    a1 = _mm256_unpacklo_epi64(a1,t0);\
    a1 = _mm256_shuffle_epi32(a1,57);\
    MIXWORD(a0,a1,tmp0,tmp1);\
    ADD_CONSTANT(a0,a1,c0,c1);
/* NMLTOM768 / MIXTON768: transpose between "normal" chaining-value
 * layout and the bit-sliced S-box ("m") layout for a 768-bit state,
 * and back.  Built entirely from unpack/shuffle moves; no arithmetic.
 * NOTE(review): both end with a trailing '\' -- the blank line after
 * each is what terminates the macro; do not remove those lines. */
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
    s2 = _mm256_load_si256(&r1);\
    q2 = _mm256_load_si256(&p1);\
    r2 = _mm256_shuffle_epi32(r2,216);\
    p2 = _mm256_shuffle_epi32(p2,216);\
    r1 = _mm256_unpacklo_epi32(r1,r0);\
    p1 = _mm256_unpacklo_epi32(p1,p0);\
    s2 = _mm256_unpackhi_epi32(s2,r0);\
    q2 = _mm256_unpackhi_epi32(q2,p0);\
    s0 = _mm256_load_si256(&r2);\
    q0 = _mm256_load_si256(&p2);\
    r2 = _mm256_unpacklo_epi64(r2,r1);\
    p2 = _mm256_unpacklo_epi64(p2,p1);\
    s1 = _mm256_load_si256(&s0);\
    q1 = _mm256_load_si256(&q0);\
    s0 = _mm256_unpackhi_epi64(s0,r1);\
    q0 = _mm256_unpackhi_epi64(q0,p1);\
    r2 = _mm256_shuffle_epi32(r2,225);\
    p2 = _mm256_shuffle_epi32(p2,225);\
    r0 = _mm256_load_si256(&s1);\
    p0 = _mm256_load_si256(&q1);\
    s0 = _mm256_shuffle_epi32(s0,225);\
    q0 = _mm256_shuffle_epi32(q0,225);\
    s1 = _mm256_unpacklo_epi64(s1,s2);\
    q1 = _mm256_unpacklo_epi64(q1,q2);\
    r0 = _mm256_unpackhi_epi64(r0,s2);\
    p0 = _mm256_unpackhi_epi64(p0,q2);\
    s2 = _mm256_load_si256(&r0);\
    q2 = _mm256_load_si256(&p0);\
    s3 = _mm256_load_si256(&r2);\
    q3 = _mm256_load_si256(&p2);\

#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
    s0 = _mm256_load_si256(&r0);\
    q0 = _mm256_load_si256(&p0);\
    s1 = _mm256_load_si256(&r2);\
    q1 = _mm256_load_si256(&p2);\
    r0 = _mm256_unpackhi_epi32(r0,r1);\
    p0 = _mm256_unpackhi_epi32(p0,p1);\
    r2 = _mm256_unpackhi_epi32(r2,r3);\
    p2 = _mm256_unpackhi_epi32(p2,p3);\
    s0 = _mm256_unpacklo_epi32(s0,r1);\
    q0 = _mm256_unpacklo_epi32(q0,p1);\
    s1 = _mm256_unpacklo_epi32(s1,r3);\
    q1 = _mm256_unpacklo_epi32(q1,p3);\
    r1 = _mm256_load_si256(&r0);\
    p1 = _mm256_load_si256(&p0);\
    r0 = _mm256_unpackhi_epi64(r0,r2);\
    p0 = _mm256_unpackhi_epi64(p0,p2);\
    s0 = _mm256_unpackhi_epi64(s0,s1);\
    q0 = _mm256_unpackhi_epi64(q0,q1);\
    r1 = _mm256_unpacklo_epi64(r1,r2);\
    p1 = _mm256_unpacklo_epi64(p1,p2);\
    s2 = _mm256_load_si256(&r0);\
    q2 = _mm256_load_si256(&p0);\
    s1 = _mm256_load_si256(&r1);\
    q1 = _mm256_load_si256(&p1);\

/* NMLTOM1024 / MIXTON1024: the 1024-bit (8-vector) transpose used by
 * rnd512_2way around the STEP_PART rounds.  MIXTON1024 is its own
 * inverse and simply reuses NMLTOM1024. */
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    s1 = _mm256_load_si256(&r3);\
    q1 = _mm256_load_si256(&p3);\
    s3 = _mm256_load_si256(&r3);\
    q3 = _mm256_load_si256(&p3);\
    s1 = _mm256_unpackhi_epi32(s1,r2);\
    q1 = _mm256_unpackhi_epi32(q1,p2);\
    s3 = _mm256_unpacklo_epi32(s3,r2);\
    q3 = _mm256_unpacklo_epi32(q3,p2);\
    s0 = _mm256_load_si256(&s1);\
    q0 = _mm256_load_si256(&q1);\
    s2 = _mm256_load_si256(&s3);\
    q2 = _mm256_load_si256(&q3);\
    r3 = _mm256_load_si256(&r1);\
    p3 = _mm256_load_si256(&p1);\
    r1 = _mm256_unpacklo_epi32(r1,r0);\
    p1 = _mm256_unpacklo_epi32(p1,p0);\
    r3 = _mm256_unpackhi_epi32(r3,r0);\
    p3 = _mm256_unpackhi_epi32(p3,p0);\
    s0 = _mm256_unpackhi_epi64(s0,r3);\
    q0 = _mm256_unpackhi_epi64(q0,p3);\
    s1 = _mm256_unpacklo_epi64(s1,r3);\
    q1 = _mm256_unpacklo_epi64(q1,p3);\
    s2 = _mm256_unpackhi_epi64(s2,r1);\
    q2 = _mm256_unpackhi_epi64(q2,p1);\
    s3 = _mm256_unpacklo_epi64(s3,r1);\
    q3 = _mm256_unpacklo_epi64(q3,p1);

#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
/* Initial values of the chaining variables (10 vectors of 4 words each,
 * duplicated per lane by luffa_2way_init). */
static const uint32 IV[40] __attribute((aligned(32))) = {
    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
};

/* Round constants: 16 pairs for the wide permutation (STEP_PART),
 * followed by 8 pairs for the last chaining pair (STEP_PART2). */
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
    0x00000000,0x00000000,0x00000000,0x5090d577,
    0x00000000,0x00000000,0x00000000,0xac11d7fa,
    0x00000000,0x00000000,0x00000000,0x2d1925ab,
    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
    0x00000000,0x00000000,0x00000000,0xb46496ac,
    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
    0x00000000,0x00000000,0x00000000,0xd1925ab0,
    0x00000000,0x00000000,0x00000000,0x78602649,
    0x00000000,0x00000000,0x00000000,0x29131ab6,
    0x00000000,0x00000000,0x00000000,0x8edae952,
    0x00000000,0x00000000,0x00000000,0x0fc053c3,
    0x00000000,0x00000000,0x00000000,0x3b6ba548,
    0x00000000,0x00000000,0x00000000,0x3f014f0c,
    0x00000000,0x00000000,0x00000000,0xedae9520,
    0x00000000,0x00000000,0x00000000,0xfc053c31
};

/* Vectorized round constants; filled from CNS_INIT by luffa_2way_init.
 * NOTE(review): this is a non-static file-scope global shared by all
 * contexts/threads -- every init writes the same values, but confirm
 * no writer races a concurrent reader at first use. */
__m256i CNS[32];
/***************************************************/
/* Round function */
/* state: hash context */
/* One full Luffa-512 round function over two interleaved lanes:
 * message injection (tweaked feed-forward with MULT2), per-word
 * rotations, transpose to bit-sliced layout, 8 rounds of the wide
 * permutation, transpose back, then 8 rounds on the last chaining
 * pair.  msg is two __m256i (32 bytes per lane), already byteswapped. */
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
{
 __m256i t[2];
 __m256i *chainv = state->chainv;
 __m256i msg0, msg1;
 __m256i tmp[2];
 __m256i x[8];

 /* t = XOR of all five chaining pairs, then multiplied by 2. */
 t[0] = chainv[0];
 t[1] = chainv[1];
 t[0] = _mm256_xor_si256( t[0], chainv[2] );
 t[1] = _mm256_xor_si256( t[1], chainv[3] );
 t[0] = _mm256_xor_si256( t[0], chainv[4] );
 t[1] = _mm256_xor_si256( t[1], chainv[5] );
 t[0] = _mm256_xor_si256( t[0], chainv[6] );
 t[1] = _mm256_xor_si256( t[1], chainv[7] );
 t[0] = _mm256_xor_si256( t[0], chainv[8] );
 t[1] = _mm256_xor_si256( t[1], chainv[9] );
 MULT2( t[0], t[1] );

 /* Reverse word order of the message block (shuffle 27 = 0,1,2,3). */
 msg0 = _mm256_shuffle_epi32( msg[0], 27 );
 msg1 = _mm256_shuffle_epi32( msg[1], 27 );

 /* Inject t into every chaining pair. */
 chainv[0] = _mm256_xor_si256( chainv[0], t[0] );
 chainv[1] = _mm256_xor_si256( chainv[1], t[1] );
 chainv[2] = _mm256_xor_si256( chainv[2], t[0] );
 chainv[3] = _mm256_xor_si256( chainv[3], t[1] );
 chainv[4] = _mm256_xor_si256( chainv[4], t[0] );
 chainv[5] = _mm256_xor_si256( chainv[5], t[1] );
 chainv[6] = _mm256_xor_si256( chainv[6], t[0] );
 chainv[7] = _mm256_xor_si256( chainv[7], t[1] );
 chainv[8] = _mm256_xor_si256( chainv[8], t[0] );
 chainv[9] = _mm256_xor_si256( chainv[9], t[1] );

 /* Forward tweak chain: v[i] = MULT2(v[i]) ^ v[i+1]. */
 t[0] = chainv[0];
 t[1] = chainv[1];
 MULT2( chainv[0], chainv[1]);
 chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
 chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
 MULT2( chainv[2], chainv[3]);
 chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
 chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
 MULT2( chainv[4], chainv[5]);
 chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
 chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
 MULT2( chainv[6], chainv[7]);
 chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
 chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
 MULT2( chainv[8], chainv[9]);
 chainv[8] = _mm256_xor_si256( chainv[8], t[0] );
 chainv[9] = _mm256_xor_si256( chainv[9], t[1] );

 /* Backward tweak chain: v[i] = MULT2(v[i]) ^ v[i-1]. */
 t[0] = chainv[8];
 t[1] = chainv[9];
 MULT2( chainv[8], chainv[9]);
 chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
 chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
 MULT2( chainv[6], chainv[7]);
 chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
 chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
 MULT2( chainv[4], chainv[5]);
 chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
 chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
 MULT2( chainv[2], chainv[3] );
 chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
 chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
 MULT2( chainv[0], chainv[1] );

 /* Inject the message, multiplying it by 2 between pairs. */
 chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t[0] ), msg0 );
 chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t[1] ), msg1 );
 MULT2( msg0, msg1);
 chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
 chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
 MULT2( msg0, msg1);
 chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
 chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
 MULT2( msg0, msg1);
 chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
 chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
 MULT2( msg0, msg1);
 chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
 chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
 MULT2( msg0, msg1);

 /* Per-pair 32-bit rotations by 1..4 bits (shift-pairs). */
 chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
                              _mm256_srli_epi32( chainv[3], 31 ) );
 chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
                              _mm256_srli_epi32( chainv[5], 30 ) );
 chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
                              _mm256_srli_epi32( chainv[7], 29 ) );
 chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
                              _mm256_srli_epi32( chainv[9], 28 ) );

 /* Transpose, run 8 wide-permutation rounds, transpose back. */
 NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
             x[0], x[1], x[2], x[3],
             chainv[1],chainv[3],chainv[5],chainv[7],
             x[4], x[5], x[6], x[7] );

 STEP_PART( &x[0], &CNS[ 0], &tmp[0] );
 STEP_PART( &x[0], &CNS[ 2], &tmp[0] );
 STEP_PART( &x[0], &CNS[ 4], &tmp[0] );
 STEP_PART( &x[0], &CNS[ 6], &tmp[0] );
 STEP_PART( &x[0], &CNS[ 8], &tmp[0] );
 STEP_PART( &x[0], &CNS[10], &tmp[0] );
 STEP_PART( &x[0], &CNS[12], &tmp[0] );
 STEP_PART( &x[0], &CNS[14], &tmp[0] );

 MIXTON1024( x[0], x[1], x[2], x[3],
             chainv[0], chainv[2], chainv[4],chainv[6],
             x[4], x[5], x[6], x[7],
             chainv[1],chainv[3],chainv[5],chainv[7]);

 /* Process last 256-bit block */
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[16], CNS[17],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[18], CNS[19],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[20], CNS[21],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[22], CNS[23],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[24], CNS[25],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[26], CNS[27],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[28], CNS[29],
             tmp[0], tmp[1] );
 STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[30], CNS[31],
             tmp[0], tmp[1] );
}
/***************************************************/
/* Finalization function */
/* state: hash context */
/* b[8]: hash values */
/* Finalization: run two blank rounds (zero message) and after each one
 * emit 32 bytes per lane -- the XOR of all five chaining pairs, word-
 * reversed and byte-swapped to big-endian.  b receives 4 x __m256i of
 * lane-interleaved digest (64 bytes per lane total). */
void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
 uint32 hash[8] __attribute((aligned(64)));
 __m256i* chainv = state->chainv;
 __m256i t[2];
 __m256i zero[2];
 zero[0] = zero[1] = _mm256_setzero_si256();

 /*---- blank round with m=0 ----*/
 rnd512_2way( state, zero );

 /* First half of the digest: XOR of all chaining pairs. */
 t[0] = chainv[0];
 t[1] = chainv[1];
 t[0] = _mm256_xor_si256( t[0], chainv[2] );
 t[1] = _mm256_xor_si256( t[1], chainv[3] );
 t[0] = _mm256_xor_si256( t[0], chainv[4] );
 t[1] = _mm256_xor_si256( t[1], chainv[5] );
 t[0] = _mm256_xor_si256( t[0], chainv[6] );
 t[1] = _mm256_xor_si256( t[1], chainv[7] );
 t[0] = _mm256_xor_si256( t[0], chainv[8] );
 t[1] = _mm256_xor_si256( t[1], chainv[9] );

 t[0] = _mm256_shuffle_epi32( t[0], 27 );
 t[1] = _mm256_shuffle_epi32( t[1], 27 );

 _mm256_store_si256( (__m256i*)&hash[0], t[0] );
 _mm256_store_si256( (__m256i*)&hash[8], t[1] );

 casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
 casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );

 /* Second blank round yields the second half of the digest. */
 rnd512_2way( state, zero );

 t[0] = chainv[0];
 t[1] = chainv[1];
 t[0] = _mm256_xor_si256( t[0], chainv[2] );
 t[1] = _mm256_xor_si256( t[1], chainv[3] );
 t[0] = _mm256_xor_si256( t[0], chainv[4] );
 t[1] = _mm256_xor_si256( t[1], chainv[5] );
 t[0] = _mm256_xor_si256( t[0], chainv[6] );
 t[1] = _mm256_xor_si256( t[1], chainv[7] );
 t[0] = _mm256_xor_si256( t[0], chainv[8] );
 t[1] = _mm256_xor_si256( t[1], chainv[9] );

 t[0] = _mm256_shuffle_epi32( t[0], 27 );
 t[1] = _mm256_shuffle_epi32( t[1], 27 );

 _mm256_store_si256( (__m256i*)&hash[0], t[0] );
 _mm256_store_si256( (__m256i*)&hash[8], t[1] );

 casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
 casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}
/* Initialize a 2-way context: broadcast each 4-word group of CNS_INIT
 * and IV into both 128-bit lanes of the vectorized constants/state,
 * and clear the input buffer.  Always returns 0.
 * NOTE(review): this (re)writes the shared file-scope CNS[] table on
 * every call -- the values are constant, but confirm first-use ordering
 * if multiple threads init concurrently. */
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{
    int i;
    state->hashbitlen = hashbitlen;

    /* Same constant in the high and low lane (set_epi32 is MSW first). */
    for ( i=0; i<32; i++ ) CNS[i] =
       _mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
                         CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ],
                         CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
                         CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ] );

    for ( i=0; i<10; i++ ) state->chainv[i] =
       _mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
                         IV[ (i<<2) +1 ], IV[ (i<<2) ],
                         IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
                         IV[ (i<<2) +1 ], IV[ (i<<2) ] );

    ((__m256i*)state->buffer)[0] = m256_zero;
    ((__m256i*)state->buffer)[1] = m256_zero;
    return 0;
}
// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called only call luffa_update or luffa_close.
/* Absorb lane-interleaved input; len is the per-lane byte count.
 * Each 32-byte (per lane) block occupies two __m256i of interleaved
 * data.  A trailing 16-byte partial block (the 80-byte header case)
 * is byteswapped and padded into state->buffer so the final transform
 * can run in luffa_2way_close() (keeps midstate reuse working).
 *
 * Fix: the block loop advanced vdata by 2 per iteration AND indexed
 * with i (vdata[i], vdata[i+1]), so every block after the first read
 * the wrong data.  Index from the advancing pointer directly, matching
 * luffa_2way_update_close(). */
int luffa_2way_update( luffa_2way_context *state, const void *data,
                       size_t len )
{
    __m256i *vdata  = (__m256i*)data;
    __m256i *buffer = (__m256i*)state->buffer;
    __m256i msg[2];
    int i;
    int blocks = (int)len / 32;
    state->rembytes = (int)len % 32;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
       msg[0] = mm256_bswap_32( vdata[ 0 ] );
       msg[1] = mm256_bswap_32( vdata[ 1 ] );
       rnd512_2way( state, msg );
    }

    // 16 byte partial block exists for 80 byte len
    // store in buffer for transform in final for midstate to work
    if ( state->rembytes )
    {
       // remaining data bytes plus the 0x80 padding marker in each lane
       buffer[0] = mm256_bswap_32( vdata[0] );
       buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                    0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
    }
    return 0;
}
/* Finalize: transform the buffered (already padded) partial block, or a
 * constant empty pad block if no data remains, then emit the digest.
 * Always returns 0. */
int luffa_2way_close( luffa_2way_context *state, void *hashval )
{
    __m256i *buffer = (__m256i*)state->buffer;
    __m256i msg[2];

    // transform pad block
    if ( state->rembytes )
      // not empty, data is in buffer
      rnd512_2way( state, buffer );
    else
    {     // empty pad block, constant data
       msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
       msg[1] = m256_zero;
       rnd512_2way( state, msg );
    }

    finalization512_2way( state, (uint32*)hashval );
    /* NOTE(review): 'hashval+128' is arithmetic on void* (GCC
     * extension); also hashbitlen never exceeds 512 in this codebase,
     * so this branch appears to be dead -- confirm before relying. */
    if ( state->hashbitlen > 512 )
        finalization512_2way( state, (uint32*)( hashval+128 ) );
    return 0;
}
/* Combined absorb + finalize, optimized for whole 16-byte (per lane)
 * multiples -- the 64- and 80-byte header cases.  inlen is the per-lane
 * byte count; data is lane-interleaved.  Always returns 0. */
int luffa_2way_update_close( luffa_2way_context *state,
                void *output, const void *data, size_t inlen )
{
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
   const __m256i *vdata = (__m256i*)data;
   __m256i msg[2];
   int i;
   const int blocks = (int)( inlen >> 5 );
   state->rembytes = inlen & 0x1F;

   // full blocks
   for ( i = 0; i < blocks; i++, vdata+=2 )
   {
      msg[0] = mm256_bswap_32( vdata[ 0 ] );
      msg[1] = mm256_bswap_32( vdata[ 1 ] );
      rnd512_2way( state, msg );
   }

   // 16 byte partial block exists for 80 byte len
   if ( state->rembytes )
   {
      // padding of partial block
      msg[0] = mm256_bswap_32( vdata[0] );
      msg[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
      rnd512_2way( state, msg );
   }
   else
   {
      // empty pad block
      msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
                                0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
      msg[1] = m256_zero;
      rnd512_2way( state, msg );
   }

   finalization512_2way( state, (uint32*)output );
   /* NOTE(review): void* arithmetic (GCC extension); branch appears
    * dead since hashbitlen <= 512 here -- confirm. */
   if ( state->hashbitlen > 512 )
      finalization512_2way( state, (uint32*)( output+128 ) );
   return 0;
}
#endif

View File

@@ -0,0 +1,69 @@
#if !defined(LUFFA_HASH_2WAY_H__)
#define LUFFA_HASH_2WAY_H__ 1
/*
* luffa_for_sse2.h
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#if defined(__AVX2__)
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
#include "avxdefs.h"
/* The supported digest lengths, in bits. */
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256
#define DIGEST_BIT_LEN_384 384
#define DIGEST_BIT_LEN_512 512

/*********************************/
/* The parameters of Luffa */
#define MSG_BLOCK_BIT_LEN 256  /*The bit length of a message block*/
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
                                                     * of a message block*/

/* The number of 256-bit chaining blocks for each digest width. */
#define WIDTH_224 3
#define WIDTH_256 3
#define WIDTH_384 4
#define WIDTH_512 5

/* The limit of the length of message (bytes) per width family. */
#define LIMIT_224 64
#define LIMIT_256 64
#define LIMIT_384 128
#define LIMIT_512 128
/*********************************/
/* 2-way (AVX2) Luffa state: each __m256i carries the same chaining
 * word for two independent hash lanes (one per 128-bit half). */
typedef struct {
    uint32 buffer[8*2] __attribute((aligned(64)));  /* pending padded input block */
    __m256i chainv[10] __attribute((aligned(32)));  /* Chaining values */
    int hashbitlen;  /* digest length in bits (224/256/384/512) */
    int rembytes;    /* per-lane bytes held in buffer from a partial block */
} luffa_2way_context;

int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
/* Absorb data (len is per-lane bytes); buffers one trailing partial block. */
int luffa_2way_update( luffa_2way_context *state, const void *data,
                       size_t len );
/* Finalize and write the lane-interleaved digest to hashval. */
int luffa_2way_close( luffa_2way_context *state, void *hashval );
/* One-shot absorb + finalize, optimized for 64/80 byte inputs. */
int luffa_2way_update_close( luffa_2way_context *state, void *output,
                             const void *data, size_t inlen );
#endif
#endif

View File

@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks // full blocks
for ( i = 0; i < blocks; i++ ) for ( i = 0; i < blocks; i++ )
{ {
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ), rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) ); mm_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN; data += MSG_BLOCK_BYTE_LEN;
} }
@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
if ( state->rembytes ) if ( state->rembytes )
{ {
// remaining data bytes // remaining data bytes
casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) ); casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
// padding of partial block // padding of partial block
casti_m128i( state->buffer, 1 ) = casti_m128i( state->buffer, 1 ) =
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks // full blocks
for ( i = 0; i < blocks; i++ ) for ( i = 0; i < blocks; i++ )
{ {
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ), rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) ); mm_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN; data += MSG_BLOCK_BYTE_LEN;
} }
@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
{ {
// padding of partial block // padding of partial block
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm_byteswap_32( cast_m128i( data ) ) ); mm_bswap_32( cast_m128i( data ) ) );
} }
else else
{ {
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t ); _mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) ); casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
rnd512( state, zero, zero ); rnd512( state, zero, zero );
@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t ); _mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) ); casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
} }
#else #else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]); _mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) ); casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) ); casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
rnd512( state, zero, zero ); rnd512( state, zero, zero );
@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]); _mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) ); casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) ); casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
} }
#endif #endif

View File

@@ -60,7 +60,7 @@ void anime_4way_hash( void *state, const void *input )
blake512_4way_close( &ctx.blake, vhash ); blake512_4way_close( &ctx.blake, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void anime_4way_hash( void *state, const void *input )
jh512_4way_close( &ctx.jh, vhash ); jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
blake512_4way_init( &ctx.blake ); blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 ); blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void anime_4way_hash( void *state, const void *input )
skein512_4way_close( &ctx.skein, vhash ); skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
keccak512_4way_init( &ctx.keccak ); keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way( &ctx.keccak, vhash, 64 );

View File

@@ -60,7 +60,7 @@ void quark_4way_hash( void *state, const void *input )
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void quark_4way_hash( void *state, const void *input )
jh512_4way_close( &ctx.jh, vhash ); jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
blake512_4way_init( &ctx.blake ); blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 ); blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void quark_4way_hash( void *state, const void *input )
skein512_4way_close( &ctx.skein, vhash ); skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
mm256_zero ); m256_zero );
keccak512_4way_init( &ctx.keccak ); keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way( &ctx.keccak, vhash, 64 );

130
algo/qubit/deep-2way.c Normal file
View File

@@ -0,0 +1,130 @@
#include "deep-gate.h"
#if defined(DEEP_2WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
// Context bundle for the 2-way Deep hash: one prototype instance
// (deep_2way_ctx) is initialised once at startup and copied into a
// stack-local working copy for every hash computed.
typedef struct
{
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_echo echo;
} deep_2way_ctx_holder;
// Global prototype contexts. The luffa member also carries the per-work
// midstate written by scanhash_deep_2way before the nonce loop.
deep_2way_ctx_holder deep_2way_ctx;
// One-time initialisation of the prototype contexts (512-bit digests).
void init_deep_2way_ctx()
{
luffa_2way_init( &deep_2way_ctx.luffa, 512 );
cubehashInit(&deep_2way_ctx.cube,512,16,32);
sph_shavite512_init(&deep_2way_ctx.shavite);
init_echo(&deep_2way_ctx.echo, 512);
};
// Compute the Deep hash for two 80-byte block headers at once.
// input:  2x128-bit interleaved headers; the first 64 bytes per lane have
//         already been absorbed into deep_2way_ctx.luffa (midstate) by
//         scanhash_deep_2way, so only the 16-byte tail is absorbed here.
// output: two concatenated 32-byte digests (lane 0, then lane 1).
// Pipeline: Luffa-512 (2-way SIMD) -> CubeHash-512 -> SHAvite-512 -> ECHO-512.
void deep_2way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*2] __attribute__ ((aligned (64)));
deep_2way_ctx_holder ctx;
// Start from the prototype contexts; luffa already holds the midstate.
memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) );
// Absorb the remaining 16 bytes per lane. The tail of the interleaved
// buffer starts at byte offset 64*2 (64 bytes from each lane consumed).
// NOTE(review): length argument assumed to be bytes-per-lane, matching
// the 64 passed in scanhash_deep_2way — confirm against luffa-hash-2way.
luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
luffa_2way_close( &ctx.luffa, vhash );
// Split the 2-way vector result back into per-lane linear buffers;
// the remaining stages are single-lane and run twice.
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
// Reset from the prototype before hashing the second lane.
memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &deep_2way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
// Only the first 256 bits of each 512-bit result are reported.
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
}
// Scan nonces two at a time with the 2-way Deep hash.
// Returns the number of candidate nonces found (0..2); candidates are
// reported via work->nonces and per-lane flags in work->nfound.
int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// Position of header word 19 (the nonce) inside the 2x128-bit
// interleaved buffer: word 19 is element 3 of 128-bit chunk 4, so
// lane 0's copy sits at vdata[32+3] and lane 1's at vdata[32+7].
uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3
uint32_t *noncep1 = vdata + 32+7;
const uint32_t Htarg = ptarget[7];
// Cheap pre-filter: pick the widest mask consistent with the target so
// most non-solutions are rejected without calling fulltest().
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
// Duplicate the 80-byte header into both lanes (640 bits).
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
// Precompute the luffa midstate over the first 64 bytes per lane.
// NOTE(review): this writes into the shared global deep_2way_ctx; if
// several miner threads run concurrently they all update the same
// global luffa state — verify this is safe for the threading model.
luffa_2way_init( &deep_2way_ctx.luffa, 512 );
luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = false;
// Nonces are stored big-endian in the header image.
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
deep_2way_hash( hash, vdata );
pdata[19] = n;
// Lane 0 result is hash[0..7], lane 1 is hash[8..15].
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

17
algo/qubit/deep-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "deep-gate.h"
// Wire the Deep algorithm into the algo gate, selecting the 2-way SIMD
// implementation when the build targets AVX2 + AES-NI (see deep-gate.h).
bool register_deep_algo( algo_gate_t* gate )
{
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
#if defined (DEEP_2WAY)
  init_deep_2way_ctx();
  gate->scanhash = (void*)&scanhash_deep_2way;
  gate->hash     = (void*)&deep_2way_hash;
#else
  init_deep_ctx();
  gate->scanhash = (void*)&scanhash_deep;
  gate->hash     = (void*)&deep_hash;
#endif
  return true;
}

32
algo/qubit/deep-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef DEEP_GATE_H__
#define DEEP_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
// The 2-way SIMD implementation needs both AVX2 and AES-NI.
#if defined(__AVX2__) && defined(__AES__)
#define DEEP_2WAY
#endif
// Install the appropriate scanhash/hash pair into the gate.
bool register_deep_algo( algo_gate_t* gate );
#if defined(DEEP_2WAY)
// 2-way implementation: hashes two 80-byte headers per call.
void deep_2way_hash( void *state, const void *input );
int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_deep_2way_ctx();
#endif
// Scalar / SSE2 reference implementation.
void deep_hash( void *state, const void *input );
int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_deep_ctx();
#endif

View File

@@ -1,9 +1,9 @@
#include "algo-gate-api.h" #include "deep-gate.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#ifndef NO_AES_NI #ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
@@ -139,12 +139,3 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
return 0; return 0;
} }
bool register_deep_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_deep_ctx();
gate->scanhash = (void*)&scanhash_deep;
gate->hash = (void*)&deep_hash;
return true;
};

138
algo/qubit/qubit-2way.c Normal file
View File

@@ -0,0 +1,138 @@
#include "qubit-gate.h"
#if defined(QUBIT_2WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
// Context bundle for the 2-way Qubit hash: one prototype instance
// (qubit_2way_ctx) is initialised once at startup and copied into a
// stack-local working copy for every hash computed.
typedef struct
{
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hashState_echo echo;
} qubit_2way_ctx_holder;
// Global prototype contexts. The luffa member also carries the per-work
// midstate written by scanhash_qubit_2way before the nonce loop.
qubit_2way_ctx_holder qubit_2way_ctx;
// One-time initialisation of the prototype contexts (512-bit digests).
void init_qubit_2way_ctx()
{
luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
cubehashInit(&qubit_2way_ctx.cube,512,16,32);
sph_shavite512_init(&qubit_2way_ctx.shavite);
simd_2way_init( &qubit_2way_ctx.simd, 512 );
init_echo(&qubit_2way_ctx.echo, 512);
};
// Compute the Qubit hash for two 80-byte block headers at once.
// input:  2x128-bit interleaved headers; the first 64 bytes per lane have
//         already been absorbed into qubit_2way_ctx.luffa (midstate) by
//         scanhash_qubit_2way, so only the 16-byte tail is absorbed here.
// output: two concatenated 32-byte digests (lane 0, then lane 1).
// Pipeline: Luffa-512 (2-way) -> CubeHash-512 -> SHAvite-512 ->
//           SIMD-512 (2-way) -> ECHO-512.
void qubit_2way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*2] __attribute__ ((aligned (64)));
qubit_2way_ctx_holder ctx;
// Start from the prototype contexts; luffa already holds the midstate.
memcpy( &ctx, &qubit_2way_ctx, sizeof(qubit_2way_ctx) );
// Absorb the remaining 16 bytes per lane; the tail of the interleaved
// buffer starts at byte offset 64*2.
luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
luffa_2way_close( &ctx.luffa, vhash );
// Split into per-lane buffers for the single-lane stages.
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
// Reset from the prototype before hashing the second lane.
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
// SIMD-512 runs 2-way: re-interleave, hash, de-interleave.
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
// Only the first 256 bits of each 512-bit result are reported.
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
}
// Scan nonces two at a time with the 2-way Qubit hash.
// Returns the number of candidate nonces found (0..2); candidates are
// reported via work->nonces and per-lane flags in work->nfound.
int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// Position of header word 19 (the nonce) inside the 2x128-bit
// interleaved buffer: word 19 is element 3 of 128-bit chunk 4, so
// lane 0's copy sits at vdata[32+3] and lane 1's at vdata[32+7].
uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3
uint32_t *noncep1 = vdata + 32+7;
const uint32_t Htarg = ptarget[7];
// Cheap pre-filter: pick the widest mask consistent with the target so
// most non-solutions are rejected without calling fulltest().
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
// Duplicate the 80-byte header into both lanes (640 bits).
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
// Precompute the luffa midstate over the first 64 bytes per lane.
// NOTE(review): this writes into the shared global qubit_2way_ctx; if
// several miner threads run concurrently they all update the same
// global luffa state — verify this is safe for the threading model.
luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = false;
// Nonces are stored big-endian in the header image.
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
qubit_2way_hash( hash, vdata );
pdata[19] = n;
// Lane 0 result is hash[0..7], lane 1 is hash[8..15].
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

17
algo/qubit/qubit-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "qubit-gate.h"
// Wire the Qubit algorithm into the algo gate, selecting the 2-way SIMD
// implementation when the build targets AVX2 + AES-NI (see qubit-gate.h).
bool register_qubit_algo( algo_gate_t* gate )
{
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
#if defined (QUBIT_2WAY)
  init_qubit_2way_ctx();
  gate->scanhash = (void*)&scanhash_qubit_2way;
  gate->hash     = (void*)&qubit_2way_hash;
#else
  init_qubit_ctx();
  gate->scanhash = (void*)&scanhash_qubit;
  gate->hash     = (void*)&qubit_hash;
#endif
  return true;
}

32
algo/qubit/qubit-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef QUBIT_GATE_H__
#define QUBIT_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
// The 2-way SIMD implementation needs both AVX2 and AES-NI.
#if defined(__AVX2__) && defined(__AES__)
#define QUBIT_2WAY
#endif
// Install the appropriate scanhash/hash pair into the gate.
bool register_qubit_algo( algo_gate_t* gate );
#if defined(QUBIT_2WAY)
// 2-way implementation: hashes two 80-byte headers per call.
void qubit_2way_hash( void *state, const void *input );
int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_qubit_2way_ctx();
#endif
// Scalar / SSE2 reference implementation.
void qubit_hash( void *state, const void *input );
int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_qubit_ctx();
#endif

View File

@@ -1,11 +1,11 @@
#include "algo-gate-api.h" #include "qubit-gate.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#ifndef NO_AES_NI #ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
@@ -48,7 +48,7 @@ void qubit_luffa_midstate( const void* input )
update_luffa( &qubit_luffa_mid, input, 64 ); update_luffa( &qubit_luffa_mid, input, 64 );
} }
void qubithash(void *output, const void *input) void qubit_hash(void *output, const void *input)
{ {
unsigned char hash[128] __attribute((aligned(64))); unsigned char hash[128] __attribute((aligned(64)));
#define hashB hash+64 #define hashB hash+64
@@ -115,7 +115,7 @@ int scanhash_qubit(int thr_id, struct work *work,
{ {
pdata[19] = ++n; pdata[19] = ++n;
be32enc(&endiandata[19], n); be32enc(&endiandata[19], n);
qubithash(hash64, endiandata); qubit_hash(hash64, endiandata);
#ifndef DEBUG_ALGO #ifndef DEBUG_ALGO
if (!(hash64[7] & mask)) if (!(hash64[7] & mask))
{ {
@@ -151,12 +151,3 @@ int scanhash_qubit(int thr_id, struct work *work,
return 0; return 0;
} }
bool register_qubit_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_qubit_ctx();
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubithash;
return true;
};

View File

@@ -778,6 +778,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate ) bool register_scrypt_algo( algo_gate_t* gate )
{ {
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init =(void*)&scrypt_miner_thread_init; gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt; gate->scanhash = (void*)&scanhash_scrypt;
// gate->hash = (void*)&scrypt_1024_1_1_256_24way; // gate->hash = (void*)&scrypt_1024_1_1_256_24way;

View File

@@ -215,18 +215,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
#if defined BE64 #if defined BE64
#if defined PLW1 #if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] = sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4 #elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 ); memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else #else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW #endif // PLW
#else // LE64 #else // LE64
#if defined PLW1 #if defined PLW1
@@ -255,7 +255,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
for ( u = 0; u < rnum; u ++ ) for ( u = 0; u < rnum; u ++ )
{ {
#if defined BE64 #if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64 #else // LE64
((__m256i*)dst)[u] = sc->val[u]; ((__m256i*)dst)[u] = sc->val[u];
#endif #endif

View File

@@ -30,13 +30,235 @@
* @author Thomas Pornin <thomas.pornin@cryptolog.com> * @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/ */
#if defined(__AVX__)
#include <stddef.h> #include <stddef.h>
#include <string.h> #include <string.h>
#include "sha2-hash-4way.h" #include "sha2-hash-4way.h"
// SHA256 4 way 32 bit
// SHA-256 initial hash value H(0) (FIPS 180-4, section 5.3.3).
static const sph_u32 H256[8] = {
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
};
// SHA-256 round constants. Only the first 64 entries are the genuine
// SHA-256 constants from FIPS 180-4 (section 4.2.2) — SHA-256 has 64
// rounds, not 80. Entries 64..79 below are the high 32 bits of the
// SHA-512 constants K512[64..79] and are NOT part of SHA-256; they are
// kept only so any legacy code indexing up to K256[79] stays in bounds.
// A correct implementation must never read past K256[63].
static const sph_u32 K256[80] = {
SPH_C32(0x428A2F98), SPH_C32(0x71374491),
SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
SPH_C32(0x06CA6351), SPH_C32(0x14292967),
SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
SPH_C32(0xD192E819), SPH_C32(0xD6990624),
SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2),
// ---- end of genuine SHA-256 constants (64 entries) ----
SPH_C32(0xCA273ECE), SPH_C32(0xD186B8C7),
SPH_C32(0xEADA7DD6), SPH_C32(0xF57D4F7F),
SPH_C32(0x06F067AA), SPH_C32(0x0A637DC5),
SPH_C32(0x113F9804), SPH_C32(0x1B710B35),
SPH_C32(0x28DB77F5), SPH_C32(0x32CAAB7B),
SPH_C32(0x3C9EBE0A), SPH_C32(0x431D67C4),
SPH_C32(0x4CC5D4BE), SPH_C32(0x597F299C),
SPH_C32(0x5FCB6FAB), SPH_C32(0x6C44198C)
};
// Ch(x,y,z) = (x & y) ^ (~x & z), expressed branch-free as
// ((y ^ z) & x) ^ z  (FIPS 180-4, section 4.1.2).
#define CHs(X, Y, Z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
// Maj(x,y,z): bitwise majority vote of the three inputs.
#define MAJs(X, Y, Z) \
_mm_or_si128( _mm_and_si128( X, Y ), \
_mm_and_si128( _mm_or_si128( X, Y ), Z ) )
// Sigma0: ROTR2 ^ ROTR13 ^ ROTR22.
#define BSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 2), mm_rotr_32(x, 13) ), mm_rotr_32( x, 22) )
// Sigma1: ROTR6 ^ ROTR11 ^ ROTR25.
#define BSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 6), mm_rotr_32(x, 11) ), mm_rotr_32( x, 25) )
// sigma0 (message schedule): ROTR7 ^ ROTR18 ^ SHR3.
#define SSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 7), mm_rotr_32(x, 18) ), _mm_srli_epi32(x, 3) )
// sigma1 (message schedule): ROTR17 ^ ROTR19 ^ SHR10.
#define SSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 17), mm_rotr_32(x, 19) ), _mm_srli_epi32(x, 10) )
// One SHA-256 round for 4 interleaved lanes. Callers rotate the role of
// the A..H registers instead of shuffling data; W[] and K256[] are read
// from the enclosing scope. Arguments are plain variables, so the
// multiple evaluation inherent in a macro is harmless here.
#define SHA256_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m128i T1, T2; \
T1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( H, BSG2_1(E) ), CHs(E, F, G) ), \
_mm_set1_epi32( K256[i] ) ), W[i] ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
} while (0)
// One SHA-256 compression of a 64-byte block for 4 interleaved lanes.
// in: 16 big-endian 32-bit words per lane; r: running state, updated
// in place.
// FIX: SHA-256 uses 64 rounds and a 64-entry message schedule
// (FIPS 180-4, section 6.2.2). The previous code expanded W to 80 and
// ran 80 rounds — a copy/paste from the SHA-512 path — consuming the
// bogus K256[64..79] entries and producing non-standard digests.
static void
sha256_4way_round( __m128i *in, __m128i r[8] )
{
   int i;
   __m128i A, B, C, D, E, F, G, H;
   __m128i W[64];

   // Load the block (data is little-endian in memory; SHA-256 is
   // big-endian) and expand the 64-word message schedule.
   for ( i = 0; i < 16; i++ )
      W[i] = mm_bswap_32( in[i] );
   for ( i = 16; i < 64; i++ )
      W[i] = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32(
           SSG2_1( W[ i-2 ] ), W[ i-7 ] ), SSG2_0( W[ i-15 ] ) ), W[ i-16 ] );

   A = r[0];
   B = r[1];
   C = r[2];
   D = r[3];
   E = r[4];
   F = r[5];
   G = r[6];
   H = r[7];

   // 64 rounds, 8 at a time with the working registers rotated through
   // the macro arguments instead of shuffling data.
   for ( i = 0; i < 64; i += 8 )
   {
      SHA256_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
      SHA256_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
      SHA256_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
      SHA256_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
      SHA256_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
      SHA256_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
      SHA256_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
      SHA256_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
   }

   // Davies-Meyer feed-forward.
   r[0] = _mm_add_epi32( r[0], A );
   r[1] = _mm_add_epi32( r[1], B );
   r[2] = _mm_add_epi32( r[2], C );
   r[3] = _mm_add_epi32( r[3], D );
   r[4] = _mm_add_epi32( r[4], E );
   r[5] = _mm_add_epi32( r[5], F );
   r[6] = _mm_add_epi32( r[6], G );
   r[7] = _mm_add_epi32( r[7], H );
}
// Reset the 4-way context: zero the byte counters and broadcast the
// SHA-256 initial state H(0) into all four lanes.
void sha256_4way_init( sha256_4way_context *sc )
{
   int i;
   sc->count_high = sc->count_low = 0;
   for ( i = 0; i < 8; i++ )
      sc->val[i] = _mm_set1_epi32( H256[i] );
}
// Absorb len bytes (per lane) of 4-way interleaved message data,
// compressing each time the 64-byte internal buffer fills. The 64-bit
// byte count is kept in two 32-bit halves with manual carry.
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
{
   __m128i *src = (__m128i*)data;
   const int buf_size = 64;
   size_t pos = (unsigned)sc->count_low & (buf_size - 1U);

   while ( len > 0 )
   {
      // Copy as much as fits in the buffer this pass.
      size_t chunk = buf_size - pos;
      if ( chunk > len )
         chunk = len;
      memcpy_128( sc->buf + (pos>>2), src, chunk>>2 );
      src += chunk>>2;
      pos += chunk;
      len -= chunk;
      if ( pos == buf_size )
      {
         sha256_4way_round( sc->buf, sc->val );
         pos = 0;
      }
      // Advance the length counter, carrying into the high word.
      uint32_t prev_low = sc->count_low;
      sc->count_low = SPH_T32( prev_low + chunk );
      if ( sc->count_low < prev_low )
         sc->count_high++;
   }
}
// Finalize: append the 0x80 pad marker, zero-fill, append the message
// length in bits (big-endian, high word then low word) and write the
// 32-byte big-endian digest for each of the 4 lanes to dst.
void sha256_4way_close( sha256_4way_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
// 0x80 lands in the low byte of the next word; the buffer is kept
// little-endian and byte-swapped inside sha256_4way_round, so this is
// the standard leading pad bit for a byte-aligned message.
sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
ptr += 4;
// No room left for the 8-byte length field: flush this block and pad
// a fresh one.
if ( ptr > pad )
{
memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_4way_round( sc->buf, sc->val );
memset_zero_128( sc->buf, pad >> 2 );
}
else
memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
// Convert the byte count into a 64-bit bit count, split high/low.
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] =
mm_bswap_32( _mm_set1_epi32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc->buf, sc->val );
// SHA-256 output is big-endian.
for ( u = 0; u < 8; u ++ )
((__m128i*)dst)[u] = mm_bswap_32( sc->val[u] );
}
#if defined(__AVX2__) #if defined(__AVX2__)
// SHA512 4 way 64 bit
// SHA-512 initial hash value H(0) (FIPS 180-4, section 5.3.5).
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
static const sph_u64 K512[80] = { static const sph_u64 K512[80] = {
SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
@@ -80,13 +302,6 @@ static const sph_u64 K512[80] = {
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
}; };
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
#define CH(X, Y, Z) \ #define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -129,7 +344,7 @@ sha512_4way_round( __m256i *in, __m256i r[8] )
__m256i W[80]; __m256i W[80];
for ( i = 0; i < 16; i++ ) for ( i = 0; i < 16; i++ )
W[i] = mm256_byteswap_64( in[i] ); W[i] = mm256_bswap_64( in[i] );
for ( i = 16; i < 80; i++ ) for ( i = 16; i < 80; i++ )
W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64(
SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] ); SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] );
@@ -182,7 +397,7 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
{ {
__m256i *vdata = (__m256i*)data; __m256i *vdata = (__m256i*)data;
size_t ptr; size_t ptr;
int buf_size = 128; const int buf_size = 128;
ptr = (unsigned)sc->count & (buf_size - 1U); ptr = (unsigned)sc->count & (buf_size - 1U);
while ( len > 0 ) while ( len > 0 )
@@ -207,8 +422,8 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
void sha512_4way_close( sha512_4way_context *sc, void *dst ) void sha512_4way_close( sha512_4way_context *sc, void *dst )
{ {
unsigned ptr, u; unsigned ptr, u;
int buf_size = 128; const int buf_size = 128;
int pad = buf_size - 16; const int pad = buf_size - 16;
ptr = (unsigned)sc->count & (buf_size - 1U); ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 ); sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
@@ -224,13 +439,14 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = sc->buf[ pad >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = sc->buf[ ( pad+8 ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
sha512_4way_round( sc->buf, sc->val ); sha512_4way_round( sc->buf, sc->val );
for ( u = 0; u < 8; u ++ ) for ( u = 0; u < 8; u ++ )
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
} }
#endif #endif // __AVX2__
#endif // __AVX__

View File

@@ -44,47 +44,19 @@
#include "sph_types.h" #include "sph_types.h"
#include "avxdefs.h" #include "avxdefs.h"
#if 0 #if defined(__AVX__)
#define SPH_SIZE_sha224 224
#define SPH_SIZE_sha256 256 #define SPH_SIZE_sha256 256
typedef struct { typedef struct {
#ifndef DOXYGEN_IGNORE __m128i buf[64>>2];
unsigned char buf[64]; /* first field, for alignment */ __m128i val[8];
sph_u32 val[8]; uint32_t count_high, count_low;
#if SPH_64 } sha256_4way_context;
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_sha224_context;
typedef sph_sha224_context sph_sha256_context; void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
void sph_sha224_init(void *cc); void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sph_sha224(void *cc, const void *data, size_t len);
void sph_sha224_close(void *cc, void *dst);
void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]);
void sph_sha256_init(void *cc);
void sph_sha256(void *cc, const void *data, size_t len);
void sph_sha256_close(void *cc, void *dst);
void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
#endif
#if defined (__AVX2__) #if defined (__AVX2__)
@@ -102,3 +74,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst );
#endif #endif
#endif #endif
#endif

View File

@@ -74,6 +74,18 @@ static const sph_u32 IV512[] = {
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
}; };
// Treat hi:lo as one 256-bit vector and return only the rotated high
// 128 bits: hi is shifted right by n 32-bit lanes and the vacated high
// lanes are filled from the low lanes of lo. This is a partial rotation
// (the rotated lo half is never produced), which is cheaper than a full
// mm_rotr256_1x32 when the caller only needs the high half.
// NOTE(review): _mm_srli_si128 / _mm_slli_si128 require immediate byte
// counts, so n must be a compile-time constant at every call site
// (works here because the function is static inline).
static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
{ return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
_mm_slli_si128( lo, 16 - (n<<2) ) );
}
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ #define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
sph_u32 t0 = (x0); \ sph_u32 t0 = (x0); \
sph_u32 t1 = (x1); \ sph_u32 t1 = (x1); \
@@ -284,42 +296,42 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round // round
k00 = m[0]; k00 = m[0];
x = _mm_xor_si128( p1, k00 ); x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = m[1]; k01 = m[1];
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = m[2]; k02 = m[2];
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = m[3]; k03 = m[3];
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x ); p0 = _mm_xor_si128( p0, x );
k10 = m[4]; k10 = m[4];
x = _mm_xor_si128( p3, k10 ); x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = m[5]; k11 = m[5];
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = m[6]; k12 = m[6];
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = m[7]; k13 = m[7];
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x ); p2 = _mm_xor_si128( p2, x );
for ( r = 0; r < 3; r ++ ) for ( r = 0; r < 3; r ++ )
{ {
// round 1, 5, 9 // round 1, 5, 9
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 ); k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 ) if ( r == 0 )
@@ -327,8 +339,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count3, sc->count2, sc->count1, sc->count0 ) ); ~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = _mm_xor_si128( p0, k00 ); x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 ); k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 ) if ( r == 1 )
@@ -336,34 +348,34 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count0, sc->count1, sc->count2, sc->count3 ) ); ~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 ); k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 ); k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x ); p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 ); k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 ); x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 ); k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 ); k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 ); k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 ) if ( r == 2 )
@@ -371,89 +383,89 @@ c512( sph_shavite_big_context *sc, const void *msg )
~sc->count1, sc->count0, sc->count3, sc->count2 ) ); ~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x ); p1 = _mm_xor_si128( p1, x );
// round 2, 6, 10 // round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) ); k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p3, k00 ); x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) ); k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) ); k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) ); k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x ); p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) ); k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p1, k10 ); x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) ); k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) ); k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) ); k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x ); p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11 // round 3, 7, 11
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 ); k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 ); x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 ); k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 ); k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 ); k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x ); p1 = _mm_xor_si128( p1, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 ); k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 ); x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 ); k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 ); k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 ); k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x ); p3 = _mm_xor_si128( p3, x );
// round 4, 8, 12 // round 4, 8, 12
@@ -461,83 +473,83 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) ); k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
x = _mm_xor_si128( p1, k00 ); x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) ); k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) ); k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) ); k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x ); p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) ); k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
x = _mm_xor_si128( p3, k10 ); x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) ); k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) ); k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) ); k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x ); p2 = _mm_xor_si128( p2, x );
} }
// round 13 // round 13
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 ); k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 ); x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 ); k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 ); k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 ); k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x ); p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 ); k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 ); x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 ); k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 ); k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero ); x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x ); p1 = _mm_xor_si128( p1, x );
h[0] = _mm_xor_si128( h[0], p2 ); h[0] = _mm_xor_si128( h[0], p2 );

853
algo/simd/simd-hash-2way.c Normal file
View File

@@ -0,0 +1,853 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "simd-hash-2way.h"
#if defined (__AVX2__)
// imported from simd_iv.h
// SIMD-512 initial chaining value (from the reference simd_iv.h):
// 8 state vectors x 4 x 32-bit words; each is duplicated into both
// 128-bit lanes at init time for the 2-way interleaved implementation.
uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
                           0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
                           0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f,
                           0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
                           0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8,
                           0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
                           0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4,
                           0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 };

/* Twiddle tables */
// NTT twiddle factors as signed 16-bit residues; each table row holds the
// same 8 factors repeated in both 128-bit lanes of the 256-bit vector.

static const m256_v16 FFT64_Twiddle[] =
{
    {{ 1,    2,    4,    8,   16,   32,   64,  128,
       1,    2,    4,    8,   16,   32,   64,  128 }},
    {{ 1,   60,    2,  120,    4,  -17,    8,  -34,
       1,   60,    2,  120,    4,  -17,    8,  -34 }},
    {{ 1,  120,    8,  -68,   64,  -30,   -2,   17,
       1,  120,    8,  -68,   64,  -30,   -2,   17 }},
    {{ 1,   46,   60,  -67,    2,   92,  120,  123,
       1,   46,   60,  -67,    2,   92,  120,  123 }},
    {{ 1,   92,  -17,  -22,   32,  117,  -30,   67,
       1,   92,  -17,  -22,   32,  117,  -30,   67 }},
    {{ 1,  -67,  120,  -73,    8,  -22,  -68,  -70,
       1,  -67,  120,  -73,    8,  -22,  -68,  -70 }},
    {{ 1,  123,  -34,  -70,  128,   67,   17,   35,
       1,  123,  -34,  -70,  128,   67,   17,   35 }},
};

static const m256_v16 FFT128_Twiddle[] =
{
    {{   1, -118,   46,  -31,   60,  116,  -67,  -61,
         1, -118,   46,  -31,   60,  116,  -67,  -61 }},
    {{   2,   21,   92,  -62,  120,  -25,  123, -122,
         2,   21,   92,  -62,  120,  -25,  123, -122 }},
    {{   4,   42,  -73, -124,  -17,  -50,  -11,   13,
         4,   42,  -73, -124,  -17,  -50,  -11,   13 }},
    {{   8,   84,  111,    9,  -34, -100,  -22,   26,
         8,   84,  111,    9,  -34, -100,  -22,   26 }},
    {{  16,  -89,  -35,   18,  -68,   57,  -44,   52,
        16,  -89,  -35,   18,  -68,   57,  -44,   52 }},
    {{  32,   79,  -70,   36,  121,  114,  -88,  104,
        32,   79,  -70,   36,  121,  114,  -88,  104 }},
    {{  64,  -99,  117,   72,  -15,  -29,   81,  -49,
        64,  -99,  117,   72,  -15,  -29,   81,  -49 }},
    {{ 128,   59,  -23, -113,  -30,  -58,  -95,  -98,
       128,   59,  -23, -113,  -30,  -58,  -95,  -98 }},
};

static const m256_v16 FFT256_Twiddle[] =
{
    {{   1,   41, -118,   45,   46,   87,  -31,   14,
         1,   41, -118,   45,   46,   87,  -31,   14 }},
    {{  60, -110,  116, -127,  -67,   80,  -61,   69,
        60, -110,  116, -127,  -67,   80,  -61,   69 }},
    {{   2,   82,   21,   90,   92,  -83,  -62,   28,
         2,   82,   21,   90,   92,  -83,  -62,   28 }},
    {{ 120,   37,  -25,    3,  123,  -97, -122, -119,
       120,   37,  -25,    3,  123,  -97, -122, -119 }},
    {{   4,  -93,   42,  -77,  -73,   91, -124,   56,
         4,  -93,   42,  -77,  -73,   91, -124,   56 }},
    {{ -17,   74,  -50,    6,  -11,   63,   13,   19,
       -17,   74,  -50,    6,  -11,   63,   13,   19 }},
    {{   8,   71,   84,  103,  111,  -75,    9,  112,
         8,   71,   84,  103,  111,  -75,    9,  112 }},
    {{ -34, -109, -100,   12,  -22,  126,   26,   38,
       -34, -109, -100,   12,  -22,  126,   26,   38 }},
    {{  16, -115,  -89,  -51,  -35,  107,   18,  -33,
        16, -115,  -89,  -51,  -35,  107,   18,  -33 }},
    {{ -68,   39,   57,   24,  -44,   -5,   52,   76,
       -68,   39,   57,   24,  -44,   -5,   52,   76 }},
    {{  32,   27,   79, -102,  -70,  -43,   36,  -66,
        32,   27,   79, -102,  -70,  -43,   36,  -66 }},
    {{ 121,   78,  114,   48,  -88,  -10,  104, -105,
       121,   78,  114,   48,  -88,  -10,  104, -105 }},
    {{  64,   54,  -99,   53,  117,  -86,   72,  125,
        64,   54,  -99,   53,  117,  -86,   72,  125 }},
    {{ -15, -101,  -29,   96,   81,  -20,  -49,   47,
       -15, -101,  -29,   96,   81,  -20,  -49,   47 }},
    {{ 128,  108,   59,  106,  -23,   85, -113,   -7,
       128,  108,   59,  106,  -23,   85, -113,   -7 }},
    {{ -30,   55,  -58,  -65,  -95,  -40,  -98,   94,
       -30,   55,  -58,  -65,  -95,  -40,  -98,   94 }}
};

// Shuffle-immediate encodings: swap adjacent 32-bit elements (1), swap
// 64-bit halves of each lane (2), or full 32-bit reversal within lane (3).
#define SHUFXOR_1 0xb1          /* 0b10110001 */
#define SHUFXOR_2 0x4e          /* 0b01001110 */
#define SHUFXOR_3 0x1b          /* 0b00011011 */

#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)

#define shufxor(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))

// imported from vector.c
// Partial reduction mod 257: x = (x & 255) - (x >> 8); result in (-128, 383).
#define REDUCE(x) \
  _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \
                    _mm256_srai_epi16( x, 8 ) )

// Fold values > 128 back by 257, giving a representative in [-128, 128].
#define EXTRA_REDUCE_S(x)\
  _mm256_sub_epi16( x, \
        _mm256_and_si256( _mm256_set1_epi16( 257 ), \
           _mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) )

#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )

// These operate on the register file of the FFT kernels via the X(i) macro.
#define DO_REDUCE( i ) X(i) = REDUCE( X(i) )

#define DO_REDUCE_FULL_S(i) \
do { \
  X(i) = REDUCE( X(i) ); \
  X(i) = EXTRA_REDUCE_S( X(i) ); \
} while(0)
// 2-way interleaved 64-point NTT over Z/257, in place.
// a: 8 __m256i = 64 signed 16-bit coefficients per 128-bit lane (two
//    independent inputs interleaved across the lanes).
// Structure: radix-8 DIF pass (output in revbin order), twiddle multiply,
// revbin transpose, radix-8 DIT pass — yielding an in-order FFT_64.
//
// Fixes vs. the imported code: the trailing `#undef BUTTERFLY` undefined a
// name that never existed, leaking BUTTERFLY_0/BUTTERFLY_N past this
// function; and the file-scope DO_REDUCE macro was redundantly redefined.
void fft64_2way( void *a )
{
  __m256i* const A = a;
  register __m256i X0, X1, X2, X3, X4, X5, X6, X7;

#define X(i) X##i

  X0 = A[0];
  X1 = A[1];
  X2 = A[2];
  X3 = A[3];
  X4 = A[4];
  X5 = A[5];
  X6 = A[6];
  X7 = A[7];

  // Begin with 8 parallels DIF FFT_8
  //
  // FFT_8 using w=4 as 8th root of unity
  //  Unrolled decimation in frequency (DIF) radix-2 NTT.
  //  Output data is in revbin_permuted order.
  // w[n] is the shift amount for multiplication by the twiddle 4^n (a power
  // of two mod 257, hence a left shift).
  static const int w[] = {0, 2, 4, 6};

#define BUTTERFLY_0( i,j ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
} while(0)

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE( 2 );
  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 1 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  /* We don't need to reduce X(7) */
  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );

#undef BUTTERFLY_0
#undef BUTTERFLY_N

  // Multiply by twiddle factors
  X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
  X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
  X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
  X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
  X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
  X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
  X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );

  // Transpose the FFT state with a revbin order permutation
  // on the rows and the column.
  // This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
    __m256i t1= X(i); \
    __m256i t2= X(j); \
    X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
    X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
} while(0)

  INTERLEAVE( 1, 0 );
  INTERLEAVE( 3, 2 );
  INTERLEAVE( 5, 4 );
  INTERLEAVE( 7, 6 );

  INTERLEAVE( 2, 0 );
  INTERLEAVE( 3, 1 );
  INTERLEAVE( 6, 4 );
  INTERLEAVE( 7, 5 );

  INTERLEAVE( 4, 0 );
  INTERLEAVE( 5, 1 );
  INTERLEAVE( 6, 2 );
  INTERLEAVE( 7, 3 );

#undef INTERLEAVE

  //Finish with 8 parallels DIT FFT_8
  //FFT_8 using w=4 as 8th root of unity
  // Unrolled decimation in time (DIT) radix-2 NTT.
  // Input data is in revbin_permuted order.
#define BUTTERFLY_0( i,j ) \
do { \
   __m256i u = X(j); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
do { \
   __m256i u = X(j); \
   X(i) = _mm256_slli_epi16( X(i), w[n] ); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)

  DO_REDUCE( 0 );
  DO_REDUCE( 1 );
  DO_REDUCE( 2 );
  DO_REDUCE( 3 );
  DO_REDUCE( 4 );
  DO_REDUCE( 5 );
  DO_REDUCE( 6 );
  DO_REDUCE( 7 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );
  DO_REDUCE_FULL_S( 7 );

// Was `#undef BUTTERFLY` (never defined); undef the names actually defined
// so they don't leak into the rest of the translation unit.
#undef BUTTERFLY_0
#undef BUTTERFLY_N

  A[0] = X0;
  A[1] = X1;
  A[2] = X2;
  A[3] = X3;
  A[4] = X4;
  A[5] = X5;
  A[6] = X6;
  A[7] = X7;

#undef X
}
// 2-way interleaved 128-point NTT, in place, built from two 64-point NTTs.
// a: 16 __m256i = 128 signed 16-bit coefficients per 128-bit lane.
void fft128_2way( void *a )
{
  // Scratch for the low half-transform so the final interleave can read
  // both halves without clobbering.
  __m256i scratch[8];
  __m256i *A = (__m256i*) a;
  int k;

  /* Size-2 butterflies: low half into scratch, twiddled high half in place */
  for ( k = 0; k < 8; k++ )
  {
     __m256i lo = _mm256_add_epi16( A[ k ], A[ k+8 ] );
     __m256i hi = _mm256_sub_epi16( A[ k ], A[ k+8 ] );
     scratch[ k ] = REDUCE_FULL_S( lo );
     hi = REDUCE_FULL_S( hi );
     hi = _mm256_mullo_epi16( hi, FFT128_Twiddle[ k ].m256i );
     A[ k+8 ] = REDUCE_FULL_S( hi );
  }

  fft64_2way( scratch );
  fft64_2way( A+8 );

  /* Transpose (i.e. interleave) the two half-size transforms back into A */
  for ( k = 0; k < 8; k++ )
  {
     A[ 2*k   ] = _mm256_unpacklo_epi16( scratch[ k ], A[ k+8 ] );
     A[ 2*k+1 ] = _mm256_unpackhi_epi16( scratch[ k ], A[ k+8 ] );
  }
}
// Expand a message block into a 128-point NTT input and transform it.
// a:     output, 256 x 16-bit per lane (two fft64 halves at a and a+128).
// x:     input message bytes (2-way interleaved layout).
// final: nonzero selects FinalTweak so the last block is domain-separated
//        (introduces the X^127 term differently — see UNPACK_TWEAK).
void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
  // Tweak constants added/subtracted on the last unpacked vector; the pair
  // is repeated for both 128-bit lanes.
  static const m256_v16 Tweak      = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
  static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};

  __m256i *X = (__m256i*)x;
  __m256i *A = (__m256i*)a;
//  __m256i *Twiddle = (__m256i*)FFT128_Twiddle;

// Zero-extend bytes to 16-bit words; low halves go to A[0..7], twiddled
// copies (the size-2 butterfly with the implicit zero upper half) to
// A[8..15].
#define UNPACK( i ) \
do { \
    __m256i t = X[i]; \
    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
    A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \
    A[2*i+8] = REDUCE(A[2*i+8]); \
    A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
    A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \
    A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)

   // This allows to tweak the last butterflies to introduce X^127
#define UNPACK_TWEAK( i,tw ) \
do { \
    __m256i t = X[i]; \
    __m256i tmp; \
    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
    A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \
    A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
    tmp      = _mm256_unpackhi_epi8( t, m256_zero ); \
    A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
    A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
                                   FFT128_Twiddle[ 2*i+1 ].m256i );\
    A[2*i+9] = REDUCE( A[ 2*i+9 ] ); \
} while(0)

  UNPACK( 0 );
  UNPACK( 1 );
  UNPACK( 2 );
  if ( final )
     UNPACK_TWEAK( 3, FinalTweak.m256i );
  else
     UNPACK_TWEAK( 3, Tweak.m256i );

#undef UNPACK
#undef UNPACK_TWEAK

  // a+128 uint16_t elements == A[8..15]: transform each half in place.
  fft64_2way( a );
  fft64_2way( a+128 );
}
// Expand a 128-byte message block into a 256-point NTT input and transform.
// a:     output, 512 x 16-bit per lane (two fft128 halves at a and a+256).
// x:     input message bytes (2-way interleaved layout).
// final: nonzero selects FinalTweak for last-block domain separation.
void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
{
  static const m256_v16 Tweak      = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
  static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};

  __m256i *X = (__m256i*)x;
  __m256i *A = (__m256i*)a;
//  __m256i *Twiddle = (__m256i*)FFT256_Twiddle;

// Zero-extend bytes to 16-bit words; low halves to A[0..15], twiddled
// copies to A[16..31] (size-2 butterfly with implicit zero upper half).
#define UNPACK( i ) \
do { \
  __m256i t = X[i]; \
  A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
  A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
                                      FFT256_Twiddle[ 2*i ].m256i ); \
  A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
  A[ 2*i +  1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
  A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
                                      FFT256_Twiddle[ 2*i + 1 ].m256i ); \
  A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
} while(0)

// This allows to tweak the last butterflies to introduce X^127
// NOTE(review): unlike UNPACK above and the fft128 variant, no REDUCE is
// applied to A[2*i+17] here — presumably safe because fft128_2way reduces
// early, but worth confirming against the reference vector.c.
#define UNPACK_TWEAK( i,tw ) \
do { \
  __m256i t = X[i]; \
  __m256i tmp; \
  A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
  A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
                                      FFT256_Twiddle[ 2*i ].m256i ); \
  A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
  tmp           = _mm256_unpackhi_epi8( t, m256_zero ); \
  A[ 2*i +  1 ] = _mm256_add_epi16( tmp, tw ); \
  A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
                                      FFT256_Twiddle[ 2*i + 1 ].m256i ); \
} while(0)

  UNPACK( 0 );
  UNPACK( 1 );
  UNPACK( 2 );
  UNPACK( 3 );
  UNPACK( 4 );
  UNPACK( 5 );
  UNPACK( 6 );
  if ( final )
    UNPACK_TWEAK( 7, FinalTweak.m256i );
  else
    UNPACK_TWEAK( 7, Tweak.m256i );

#undef UNPACK
#undef UNPACK_TWEAK

  // a+256 uint16_t elements == A[16..31]: transform each half in place.
  fft128_2way( a );
  fft128_2way( a+256 );
}
// SIMD-512 round function, 2-way interleaved.
// state: 8 __m256i of chaining state (S0..S3, each split into l/h halves).
// msg:   the raw message block, XORed into the state at entry and used
//        again (via S[]) in the feed-forward.
// fft:   the NTT-expanded message produced by fft256_2way_msg.
// Runs 8 message rounds (4 with code 185, 4 with code 233) plus one
// feed-forward round, then writes the state back.
void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
{
  register __m256i S0l, S1l, S2l, S3l;
  register __m256i S0h, S1h, S2h, S3h;
  __m256i *S = (__m256i*) state;
  __m256i *M = (__m256i*) msg;
  __m256i *W = (__m256i*) fft;

  // Per-round multipliers used when recombining fft words (see MSG).
  static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) };

  // XOR the message into the state.
  S0l = _mm256_xor_si256( S[0], M[0] );
  S0h = _mm256_xor_si256( S[1], M[1] );
  S1l = _mm256_xor_si256( S[2], M[2] );
  S1h = _mm256_xor_si256( S[3], M[3] );
  S2l = _mm256_xor_si256( S[4], M[4] );
  S2h = _mm256_xor_si256( S[5], M[5] );
  S3l = _mm256_xor_si256( S[6], M[6] );
  S3h = _mm256_xor_si256( S[7], M[7] );

#define S(i) S##i

// Boolean round functions: F_0 = IF(B,C,D), F_1 = MAJ(B,C,D).
#define F_0(B, C, D) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( C,D ), B ), D )
#define F_1(B, C, D) \
   _mm256_or_si256( _mm256_and_si256( D, C ),\
                    _mm256_and_si256( _mm256_or_si256( D,C ), B ) )

#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)

  // We split the round function in two halfes
  // so as to insert some independent computations in between

  // SUM7_rz tables: z'th entry of the length-7 permutation sequence rotated
  // by r; PERM below dispatches on the result via token pasting.
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6

#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0

#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1

#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2

#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3

#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4

#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5

// Select the element permutation for step z of the round starting at
// PERM_START (a macro redefined before each ROUND below).
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)

// PERM_n implements "XOR k" on the 8 32-bit element indices of the
// 256-bit (l,h) register pair, via in-lane shuffles and l/h swaps.
#define PERM_0(d,a) /* XOR 1 */ \
do { \
    d##l = shufxor( a##l, 1 ); \
    d##h = shufxor( a##h, 1 ); \
} while(0)

#define PERM_1(d,a) /* XOR 6 */ \
do { \
    d##l = shufxor( a##h, 2 ); \
    d##h = shufxor( a##l, 2 ); \
} while(0)

#define PERM_2(d,a) /* XOR 2 */ \
do { \
    d##l = shufxor( a##l, 2 ); \
    d##h = shufxor( a##h, 2 ); \
} while(0)

#define PERM_3(d,a) /* XOR 3 */ \
do { \
    d##l = shufxor( a##l, 3 ); \
    d##h = shufxor( a##h, 3 ); \
} while(0)

#define PERM_4(d,a) /* XOR 5 */ \
do { \
   d##l = shufxor( a##h, 1 ); \
   d##h = shufxor( a##l, 1 ); \
} while(0)

#define PERM_5(d,a) /* XOR 7 */ \
do { \
   d##l = shufxor( a##h, 3 ); \
   d##h = shufxor( a##l, 3 ); \
} while(0)

#define PERM_6(d,a) /* XOR 4 */ \
do { \
   d##l = a##h; \
   d##h = a##l; \
} while(0)

// First half of a step: compute F, rotate a, add message word, rotate the
// temporary, and permute d <- a.  STEP_2 then folds the temporary into d.
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
do { \
   TTl  = Fl( a,b,c,fun ); \
   TTh  = Fh( a,b,c,fun ); \
   a##l = mm256_rotl_32( a##l, r ); \
   a##h = mm256_rotl_32( a##h, r ); \
   w##l = _mm256_add_epi32( w##l, d##l ); \
   w##h = _mm256_add_epi32( w##h, d##h ); \
   TTl  = _mm256_add_epi32( TTl, w##l ); \
   TTh  = _mm256_add_epi32( TTh, w##h ); \
   TTl  = mm256_rotl_32( TTl, s ); \
   TTh  = mm256_rotl_32( TTh, s ); \
   PERM( z,d,a ); \
} while(0)

#define STEP_1( a,b,c,d,w,fun,r,s,z )   STEP_1_( a,b,c,d,w,fun,r,s,z )

#define STEP_2_( a,b,c,d,w,fun,r,s ) \
do { \
   d##l = _mm256_add_epi32( d##l, TTl ); \
   d##h = _mm256_add_epi32( d##h, TTh ); \
} while(0)

#define STEP_2( a,b,c,d,w,fun,r,s )   STEP_2_( a,b,c,d,w,fun,r,s )

#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \
do { \
   register __m256i TTl, TTh, Wl=w1, Wh=w2; \
   STEP_1( a,b,c,d,W,fun,r,s,z ); \
   STEP_2( a,b,c,d,W,fun,r,s ); \
} while(0);

// Build a 32-bit message-word pair from two 16-bit fft words, scaled by
// code[z] (185 or 233).
#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)

#define MSG( w,hh,ll,u,z ) \
do { \
   int a = MSG_##u(hh); \
   int b = MSG_##u(ll); \
   w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \
   w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \
   w##h = _mm256_unpackhi_epi16( W[a], W[b]) ; \
   w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \
} while(0)

// One full round: four steps with rotation amounts (r,s,t,u), each MSG
// load interleaved with the previous step's STEP_2 for ILP.
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
do { \
   register __m256i W0l, W1l, W2l, W3l, TTl; \
   register __m256i W0h, W1h, W2h, W3h, TTh; \
   MSG( W0, h0, l0, u0, z ); \
   STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \
   MSG( W1, h1, l1, u1, z ); \
   STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
   STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \
   MSG( W2,h2,l2,u2,z ); \
   STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
   STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \
   MSG( W3,h3,l3,u3,z ); \
   STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
   STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \
   STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
} while(0)

  // 4 rounds with code 185
#define PERM_START 0
  ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0,  3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
  ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1,  3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
  ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22,  7, 0);
#undef PERM_START
#define PERM_START 5
  ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22,  7, 0);
#undef PERM_START

  // 4 rounds with code 233
#define PERM_START 2
  ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1);
#undef PERM_START
#define PERM_START 6
  ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1);
#undef PERM_START
#define PERM_START 3
  ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
  ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1);
#undef PERM_START

  // 1 round as feed-forward: mix in the pre-round state S[] (the
  // message-XORed input), then store the result.
#define PERM_START 4
  STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0,  4, 13, 0 );
  STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 );
  STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 );
  STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25,  4, 3 );

  S[0] = S0l;  S[1] = S0h;  S[2] = S1l;  S[3] = S1h;
  S[4] = S2l;  S[5] = S2h;  S[6] = S3l;  S[7] = S3h;

#undef PERM_START
#undef STEP_1
#undef STEP_2
#undef STEP
#undef ROUND
}
// One 2-way SIMD compression step: expand the message block with the
// 256-point NTT, then run the 512-bit round function over the state.
// final selects last-block tweaking in the expansion.
void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final )
{
  m256_v16 expanded[32];                       // NTT output buffer
  uint16_t *fft = (uint16_t*) expanded[0].u16;

  fft256_2way_msg( fft, m, final );
  rounds512_2way( state->A, m, fft );
}
// imported from nist.c
// Initialise a 2-way SIMD state. Both 128-bit lanes of every AVX2
// register receive the same SIMD-512 IV words, so the two lanes start
// from identical chaining values. Note the IV is always the 512-bit one
// regardless of 'hashbitlen'; only the stored length differs.
// Always returns 0.
int simd_2way_init( simd_2way_context *state, int hashbitlen )
{
  __m256i *iv = (__m256i*)state->A;
  int i;

  state->hashbitlen = hashbitlen;
  state->n_feistels = 8;
  state->blocksize  = 128*8;   // bits per lane in one block
  state->count      = 0;

  for ( i = 0; i < 8; i++ )
  {
     const uint32_t w0 = SIMD_IV_512[ 4*i     ];
     const uint32_t w1 = SIMD_IV_512[ 4*i + 1 ];
     const uint32_t w2 = SIMD_IV_512[ 4*i + 2 ];
     const uint32_t w3 = SIMD_IV_512[ 4*i + 3 ];
     // duplicate the four IV words into both 128-bit lanes
     iv[i] = _mm256_set_epi32( w3, w2, w1, w0, w3, w2, w1, w0 );
  }
  return 0;
}
// Absorb 'databitlen' bits of message data into the 2-way state.
// 'databitlen' counts the bits of ONE lane; the interleaved input holds
// both lanes, so every byte offset is doubled (the 2* factors).
// Data is hashed straight from the caller's buffer when a whole block is
// available and nothing is buffered; otherwise it is staged in
// state->buffer until a block fills. Always returns 0.
int simd_2way_update( simd_2way_context *state, const void *data,
                      int databitlen )
{
  int bs = state->blocksize;              // bits per lane in one block
  int current = state->count & (bs - 1);  // bits already buffered
  // Arithmetic on void* is a GCC extension (invalid in ISO C);
  // step through the input with a byte pointer instead.
  const uint8_t *in = (const uint8_t*)data;

  while ( databitlen > 0 )
  {
    if ( current == 0 && databitlen >= bs )
    {
      // Full block and empty buffer: hash directly from the input.
      SIMD_2way_Compress( state, in, 0 );
      databitlen -= bs;
      in += 2*(bs/8);
      state->count += bs;
    }
    else
    {
      // Copy a chunk of data to the buffer.
      int len = bs - current;
      if ( databitlen < len )
      {
        // Not enough to finish a block: stage it and return.
        memcpy( state->buffer + 2*(current/8), in, 2*((databitlen+7)/8) );
        state->count += databitlen;
        return 0;
      }
      else
      {
        // Buffer is now full: hash it and continue with the remainder.
        memcpy( state->buffer + 2*(current/8), in, 2*(len/8) );
        state->count += len;
        databitlen -= len;
        in += 2*(len/8);
        current = 0;
        SIMD_2way_Compress( state, state->buffer, 0 );
      }
    }
  }
  return 0;
}
// Finalise the hash: flush any buffered data, feed the bit length as a
// dedicated last block, and copy the digest out of the state.
// hashval receives 2*(hashbitlen/8) bytes (both interleaved lanes).
// Always returns 0.
int simd_2way_close( simd_2way_context *state, void *hashval )
{
  uint64_t l;
  int current = state->count & (state->blocksize - 1);
  int i;
  int isshort = 1;
  // If there is still some data in the buffer, hash it
  if ( current )
  {
    current = ( current+7 ) / 8;   // round partial block up to whole bytes
    // zero the unused tail of both lanes before compressing
    memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current ) );
    SIMD_2way_Compress( state, state->buffer, 0 );
  }
  //* Input the message length as the last block
  memset( state->buffer, 0, 2*(state->blocksize / 8) );
  l = state->count;
  for ( i = 0; i < 8; i++ )
  {
    // little-endian 64-bit length, written once per 16-byte lane
    state->buffer[ i ] = l & 0xff;
    state->buffer[ i+16 ] = l & 0xff;
    l >>= 8;
  }
  // final-block code passed to the compressor: 2 for messages shorter
  // than 16384 bits, 1 otherwise (matches the reference SIMD padding)
  if ( state->count < 16384 )
    isshort = 2;
  SIMD_2way_Compress( state, state->buffer, isshort );
  memcpy( hashval, state->A, 2*(state->hashbitlen / 8) );
  return 0;
}
// Absorb 'databitlen' bits of message data (one lane's bit count; the
// interleaved input holds both lanes) and finalise the hash in one call.
// hashval receives 2*(hashbitlen/8) bytes. Always returns 0.
//
// The original body duplicated the absorb loop and the finalisation
// byte-for-byte (including non-portable void* arithmetic); the only
// difference was 'break' instead of 'return' on the partial-copy path,
// which makes delegating to the two primitives behaviorally identical:
// simd_2way_update() returns after staging a partial block, and
// simd_2way_close() performs exactly the finalisation that followed the
// loop here (it re-derives 'current' from state->count itself).
int simd_2way_update_close( simd_2way_context *state, void *hashval,
                            const void *data, int databitlen )
{
  simd_2way_update( state, data, databitlen );
  return simd_2way_close( state, hashval );
}
#endif

View File

@@ -0,0 +1,27 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1
#include "simd-compat.h"
#if defined(__AVX2__)
#include "avxdefs.h"
typedef struct {
uint32_t A[ 32*2 ] __attribute__((aligned(64)));
uint8_t buffer[ 128*2 ] __attribute__((aligned(64)));
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd_2way_context;
int simd_2way_init( simd_2way_context *state, int hashbitlen );
int simd_2way_update( simd_2way_context *state, const void *data,
int databitlen );
int simd_2way_close( simd_2way_context *state, void *hashval );
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen );
#endif
#endif

View File

@@ -1,3 +1,6 @@
#if !defined(SIMD_IV_H__)
#define SIMD_IV_H__
u32 IV_224[] = { u32 IV_224[] = {
0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53, 0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53,
0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96, 0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96,
@@ -25,3 +28,5 @@ u32 IV_512[] = {
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
}; };
#endif

View File

@@ -1,23 +0,0 @@
#ifndef DEFS_X5_H__
#define DEFS_X5_H__
#include <emmintrin.h>
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
typedef unsigned char uint8;
typedef unsigned int uint32;
typedef unsigned long long uint64;
typedef struct {
uint32 buffer[8]; /* Buffer to be hashed */
__m128i chainv[10]; /* Chaining values */
uint64 bitlen[2]; /* Message length in bits */
uint32 rembitlen; /* Length of buffer data to be hashed */
int hashbitlen;
} hashState_luffa;
typedef unsigned char byte;
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -63,13 +63,13 @@ MAYBE_INLINE void fft64(void *a) {
v16* const A = a; v16* const A = a;
register v16 X0, X1, X2, X3, X4, X5, X6, X7; register v16 X0, X1, X2, X3, X4, X5, X6, X7;
/*
#if V16_SIZE == 8 #if V16_SIZE == 8
#define X(i) A[i] #define X(i) A[i]
#elif V16_SIZE == 4 #elif V16_SIZE == 4
#define X(i) A[2*i] #define X(i) A[2*i]
#endif #endif
*/
#define X(i) X##i #define X(i) X##i
X0 = A[0]; X0 = A[0];
@@ -623,6 +623,11 @@ void rounds(u32* state, const unsigned char* msg, short* fft) {
STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20); STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20);
S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3); S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
} }
@@ -849,24 +854,32 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
*/ */
#define PERM_START 0 #define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4 #define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0); ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1 #define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0); ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5 #define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0); ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
/* /*
* 4 rounds with code 233 * 4 rounds with code 233
*/ */
#define PERM_START 2 #define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6 #define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1); ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3 #define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1); ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0 #define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1); ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
/* /*
@@ -877,9 +890,15 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1); STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2); STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3); STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3);
#undef PERM_START
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
} }
void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) { void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {

View File

@@ -125,14 +125,14 @@ void sm3_4way_close( void *cc, void *dst )
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
} }
count[0] = mm_byteswap_32( count[0] = mm_bswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) ); _mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + count[1] = mm_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) ); ( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block ); sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ ) for ( i = 0; i < 8 ; i++ )
hash[i] = mm_byteswap_32( ctx->digest[i] ); hash[i] = mm_bswap_32( ctx->digest[i] );
} }
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \ #define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \
@@ -165,7 +165,7 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
int j; int j;
for ( j = 0; j < 16; j++ ) for ( j = 0; j < 16; j++ )
W[j] = mm_byteswap_32( block[j] ); W[j] = mm_bswap_32( block[j] );
for ( j = 16; j < 68; j++ ) for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ], W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],

View File

@@ -229,18 +229,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
#if defined BE64 #if defined BE64
#if defined PLW1 #if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] = sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#elif defined PLW4 #elif defined PLW4
memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 ); memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#else #else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
#endif // PLW #endif // PLW
#else // LE64 #else // LE64
#if defined PLW1 #if defined PLW1
@@ -276,7 +276,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
for ( u = 0; u < rnum; u ++ ) for ( u = 0; u < rnum; u ++ )
{ {
#if defined BE64 #if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
#else // LE64 #else // LE64
((__m256i*)dst)[u] = sc->val[u]; ((__m256i*)dst)[u] = sc->val[u];
#endif #endif

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
typedef struct { typedef struct {
@@ -25,10 +25,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
} c11_4way_ctx_holder; } c11_4way_ctx_holder;
@@ -42,10 +42,10 @@ void init_c11_4way_ctx()
skein512_4way_init( &c11_4way_ctx.skein ); skein512_4way_init( &c11_4way_ctx.skein );
jh512_4way_init( &c11_4way_ctx.jh ); jh512_4way_init( &c11_4way_ctx.jh );
keccak512_4way_init( &c11_4way_ctx.keccak ); keccak512_4way_init( &c11_4way_ctx.keccak );
init_luffa( &c11_4way_ctx.luffa, 512 ); luffa_2way_init( &c11_4way_ctx.luffa, 512 );
cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_4way_ctx.shavite ); sph_shavite512_init( &c11_4way_ctx.shavite );
init_sd( &c11_4way_ctx.simd, 512 ); simd_2way_init( &c11_4way_ctx.simd, 512 );
init_echo( &c11_4way_ctx.echo, 512 ); init_echo( &c11_4way_ctx.echo, 512 );
} }
@@ -56,6 +56,7 @@ void c11_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
c11_4way_ctx_holder ctx; c11_4way_ctx_holder ctx;
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) ); memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
@@ -98,17 +99,13 @@ void c11_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa // 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -136,17 +133,13 @@ void c11_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, simd_2way_init( &ctx.simd, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,

View File

@@ -22,9 +22,9 @@
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"

View File

@@ -12,7 +12,7 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
@@ -25,7 +25,7 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
} tt8_4way_ctx_holder; } tt8_4way_ctx_holder;
@@ -39,7 +39,7 @@ void init_tt8_4way_ctx()
skein512_4way_init( &tt8_4way_ctx.skein ); skein512_4way_init( &tt8_4way_ctx.skein );
jh512_4way_init( &tt8_4way_ctx.jh ); jh512_4way_init( &tt8_4way_ctx.jh );
keccak512_4way_init( &tt8_4way_ctx.keccak ); keccak512_4way_init( &tt8_4way_ctx.keccak );
init_luffa( &tt8_4way_ctx.luffa, 512 ); luffa_2way_init( &tt8_4way_ctx.luffa, 512 );
cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 );
}; };
@@ -139,17 +139,13 @@ void timetravel_4way_hash(void *output, const void *input)
case 6: case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 ); vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
(const BitSequence *)hash0, dataLen ); luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
(const BitSequence*)hash1, dataLen ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
if ( i != 7 ) if ( i != 7 )
mm256_interleave_4x64( vhashB, mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 ); hash0, hash1, hash2, hash3, dataLen<<3 );

View File

@@ -9,7 +9,7 @@
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#ifdef NO_AES_NI #ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h" #include "algo/groestl/sph_groestl.h"

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 }; static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -27,10 +27,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
} tt10_4way_ctx_holder; } tt10_4way_ctx_holder;
tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64))); tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));
@@ -43,10 +43,10 @@ void init_tt10_4way_ctx()
skein512_4way_init( &tt10_4way_ctx.skein ); skein512_4way_init( &tt10_4way_ctx.skein );
jh512_4way_init( &tt10_4way_ctx.jh ); jh512_4way_init( &tt10_4way_ctx.jh );
keccak512_4way_init( &tt10_4way_ctx.keccak ); keccak512_4way_init( &tt10_4way_ctx.keccak );
init_luffa( &tt10_4way_ctx.luffa, 512 ); luffa_2way_init( &tt10_4way_ctx.luffa, 512 );
cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_4way_ctx.shavite ); sph_shavite512_init( &tt10_4way_ctx.shavite );
init_sd( &tt10_4way_ctx.simd, 512 ); simd_2way_init( &tt10_4way_ctx.simd, 512 );
}; };
void timetravel10_4way_hash(void *output, const void *input) void timetravel10_4way_hash(void *output, const void *input)
@@ -145,17 +145,13 @@ void timetravel10_4way_hash(void *output, const void *input)
case 6: case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 ); vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
(const BitSequence *)hash0, dataLen ); luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
(const BitSequence*)hash1, dataLen ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
if ( i != 9 ) if ( i != 9 )
mm256_interleave_4x64( vhashB, mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 ); hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -199,17 +195,13 @@ void timetravel10_4way_hash(void *output, const void *input)
case 9: case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 ); vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 );
(const BitSequence *)hash0, dataLen<<3 ); simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 );
(const BitSequence *)hash1, dataLen<<3 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 );
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
if ( i != 9 ) if ( i != 9 )
mm256_interleave_4x64( vhashB, mm256_interleave_4x64( vhashB,
hash0, hash1, hash2, hash3, dataLen<<3 ); hash0, hash1, hash2, hash3, dataLen<<3 );

View File

@@ -8,10 +8,10 @@
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#ifdef NO_AES_NI #ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h" #include "algo/groestl/sph_groestl.h"

View File

@@ -5,17 +5,16 @@
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include "algo/blake/blake-hash-4way.h" #include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
typedef struct { typedef struct {
@@ -25,10 +24,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
} x11_4way_ctx_holder; } x11_4way_ctx_holder;
@@ -42,10 +41,10 @@ void init_x11_4way_ctx()
skein512_4way_init( &x11_4way_ctx.skein ); skein512_4way_init( &x11_4way_ctx.skein );
jh512_4way_init( &x11_4way_ctx.jh ); jh512_4way_init( &x11_4way_ctx.jh );
keccak512_4way_init( &x11_4way_ctx.keccak ); keccak512_4way_init( &x11_4way_ctx.keccak );
init_luffa( &x11_4way_ctx.luffa, 512 ); luffa_2way_init( &x11_4way_ctx.luffa, 512 );
cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_4way_ctx.shavite ); sph_shavite512_init( &x11_4way_ctx.shavite );
init_sd( &x11_4way_ctx.simd, 512 ); simd_2way_init( &x11_4way_ctx.simd, 512 );
init_echo( &x11_4way_ctx.echo, 512 ); init_echo( &x11_4way_ctx.echo, 512 );
} }
@@ -56,6 +55,8 @@ void x11_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*2] __attribute__ ((aligned (64)));
x11_4way_ctx_holder ctx; x11_4way_ctx_holder ctx;
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) ); memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
@@ -94,21 +95,16 @@ void x11_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash ); keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa // 7 Luffa parallel 2 way 128 bit
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -136,17 +132,13 @@ void x11_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, simd_2way_init( &ctx.simd, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,

View File

@@ -10,10 +10,8 @@
#include "algo/jh/sph_jh.h" #include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h" #include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#ifndef NO_AES_NI #ifndef NO_AES_NI
@@ -21,9 +19,9 @@
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"

View File

@@ -11,15 +11,12 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
typedef struct { typedef struct {
blake512_4way_context blake; blake512_4way_context blake;
@@ -28,10 +25,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
} x11evo_4way_ctx_holder; } x11evo_4way_ctx_holder;
@@ -45,10 +42,11 @@ void init_x11evo_4way_ctx()
skein512_4way_init( &x11evo_4way_ctx.skein ); skein512_4way_init( &x11evo_4way_ctx.skein );
jh512_4way_init( &x11evo_4way_ctx.jh ); jh512_4way_init( &x11evo_4way_ctx.jh );
keccak512_4way_init( &x11evo_4way_ctx.keccak ); keccak512_4way_init( &x11evo_4way_ctx.keccak );
luffa_2way_init( &x11evo_4way_ctx.luffa, 512 );
init_luffa( &x11evo_4way_ctx.luffa, 512 ); init_luffa( &x11evo_4way_ctx.luffa, 512 );
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11evo_4way_ctx.shavite ); sph_shavite512_init( &x11evo_4way_ctx.shavite );
init_sd( &x11evo_4way_ctx.simd, 512 ); simd_2way_init( &x11evo_4way_ctx.simd, 512 );
init_echo( &x11evo_4way_ctx.echo, 512 ); init_echo( &x11evo_4way_ctx.echo, 512 );
} }
@@ -142,20 +140,13 @@ void x11evo_4way_hash( void *state, const void *input )
case 6: case 6:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 ); vhash, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 );
sizeof(hashState_luffa) ); mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 );
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
if ( i < len-1 ) if ( i < len-1 )
mm256_interleave_4x64( vhash, mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 ); hash0, hash1, hash2, hash3, 64<<3 );
@@ -202,17 +193,13 @@ void x11evo_4way_hash( void *state, const void *input )
case 9: case 9:
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
vhash, 64<<3 ); vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
if ( i < len-1 ) if ( i < len-1 )
mm256_interleave_4x64( vhash, mm256_interleave_4x64( vhash,
hash0, hash1, hash2, hash3, 64<<3 ); hash0, hash1, hash2, hash3, 64<<3 );

View File

@@ -22,9 +22,9 @@
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
typedef struct { typedef struct {
#ifdef NO_AES_NI #ifdef NO_AES_NI

View File

@@ -13,10 +13,10 @@
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/gost/sph_gost.h" #include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
typedef struct { typedef struct {
@@ -27,10 +27,10 @@ typedef struct {
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
sph_gost512_context gost; sph_gost512_context gost;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
} x11gost_4way_ctx_holder; } x11gost_4way_ctx_holder;
@@ -45,10 +45,10 @@ void init_x11gost_4way_ctx()
jh512_4way_init( &x11gost_4way_ctx.jh ); jh512_4way_init( &x11gost_4way_ctx.jh );
keccak512_4way_init( &x11gost_4way_ctx.keccak ); keccak512_4way_init( &x11gost_4way_ctx.keccak );
sph_gost512_init( &x11gost_4way_ctx.gost ); sph_gost512_init( &x11gost_4way_ctx.gost );
init_luffa( &x11gost_4way_ctx.luffa, 512 ); luffa_2way_init( &x11gost_4way_ctx.luffa, 512 );
cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11gost_4way_ctx.shavite ); sph_shavite512_init( &x11gost_4way_ctx.shavite );
init_sd( &x11gost_4way_ctx.simd, 512 ); simd_2way_init( &x11gost_4way_ctx.simd, 512 );
init_echo( &x11gost_4way_ctx.echo, 512 ); init_echo( &x11gost_4way_ctx.echo, 512 );
} }
@@ -59,6 +59,7 @@ void x11gost_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x11gost_4way_ctx_holder ctx; x11gost_4way_ctx_holder ctx;
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) ); memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
@@ -109,17 +110,13 @@ void x11gost_4way_hash( void *state, const void *input )
sph_gost512( &ctx.gost, hash3, 64 ); sph_gost512( &ctx.gost, hash3, 64 );
sph_gost512_close( &ctx.gost, hash3 ); sph_gost512_close( &ctx.gost, hash3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) ); memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
@@ -144,17 +141,12 @@ void x11gost_4way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 ); sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 ); (const BitSequence *) hash0, 512 );

View File

@@ -10,9 +10,9 @@
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -27,10 +27,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
@@ -46,10 +46,10 @@ void init_x13_4way_ctx()
skein512_4way_init( &x13_4way_ctx.skein ); skein512_4way_init( &x13_4way_ctx.skein );
jh512_4way_init( &x13_4way_ctx.jh ); jh512_4way_init( &x13_4way_ctx.jh );
keccak512_4way_init( &x13_4way_ctx.keccak ); keccak512_4way_init( &x13_4way_ctx.keccak );
init_luffa( &x13_4way_ctx.luffa, 512 ); luffa_2way_init( &x13_4way_ctx.luffa, 512 );
cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13_4way_ctx.shavite ); sph_shavite512_init( &x13_4way_ctx.shavite );
init_sd( &x13_4way_ctx.simd, 512 ); simd_2way_init( &x13_4way_ctx.simd, 512 );
init_echo( &x13_4way_ctx.echo, 512 ); init_echo( &x13_4way_ctx.echo, 512 );
hamsi512_4way_init( &x13_4way_ctx.hamsi ); hamsi512_4way_init( &x13_4way_ctx.hamsi );
sph_fugue512_init( &x13_4way_ctx.fugue ); sph_fugue512_init( &x13_4way_ctx.fugue );
@@ -104,17 +104,13 @@ void x13_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa // 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -142,17 +138,13 @@ void x13_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -168,10 +160,10 @@ void x13_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit // 12 Hamsi parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial // 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -19,9 +19,9 @@
#include "algo/hamsi/sph_hamsi.h" #include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/sm3/sm3-hash-4way.h" #include "algo/sm3/sm3-hash-4way.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
@@ -28,10 +28,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
sm3_4way_ctx_t sm3; sm3_4way_ctx_t sm3;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
@@ -49,10 +49,10 @@ void init_x13sm3_4way_ctx()
skein512_4way_init( &x13sm3_4way_ctx.skein ); skein512_4way_init( &x13sm3_4way_ctx.skein );
jh512_4way_init( &x13sm3_4way_ctx.jh ); jh512_4way_init( &x13sm3_4way_ctx.jh );
keccak512_4way_init( &x13sm3_4way_ctx.keccak ); keccak512_4way_init( &x13sm3_4way_ctx.keccak );
init_luffa( &x13sm3_4way_ctx.luffa, 512 ); luffa_2way_init( &x13sm3_4way_ctx.luffa, 512 );
cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13sm3_4way_ctx.shavite ); sph_shavite512_init( &x13sm3_4way_ctx.shavite );
init_sd( &x13sm3_4way_ctx.simd, 512 ); simd_2way_init( &x13sm3_4way_ctx.simd, 512 );
init_echo( &x13sm3_4way_ctx.echo, 512 ); init_echo( &x13sm3_4way_ctx.echo, 512 );
sm3_4way_init( &x13sm3_4way_ctx.sm3 ); sm3_4way_init( &x13sm3_4way_ctx.sm3 );
hamsi512_4way_init( &x13sm3_4way_ctx.hamsi ); hamsi512_4way_init( &x13sm3_4way_ctx.hamsi );
@@ -111,17 +111,13 @@ void x13sm3_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// Luffa // Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// Cubehash // Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -149,17 +145,13 @@ void x13sm3_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// Simd // Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// Echo // Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -190,12 +182,13 @@ void x13sm3_4way_hash( void *state, const void *input )
sm3_4way( &ctx.sm3, vhash, 64 ); sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash ); sm3_4way_close( &ctx.sm3, sm3_vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
// Hamsi parallel 32 bit // Hamsi parallel 4x32x2
hamsi512_4way( &ctx.hamsi, sm3_vhash, 64 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// Fugue serial // Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -15,9 +15,9 @@
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
#include "algo/sm3/sph_sm3.h" #include "algo/sm3/sph_sm3.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/echo/sse2/sph_echo.h" #include "algo/echo/sse2/sph_echo.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"

View File

@@ -9,8 +9,7 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h"
#include "algo/fugue//sph_fugue.h" #include "algo/fugue//sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
//#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h" #include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
@@ -18,7 +17,7 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
shabal512_4way_context shabal; shabal512_4way_context shabal;
hashState_echo echo; hashState_echo echo;
hashState_luffa luffa; luffa_2way_context luffa;
sph_fugue512_context fugue; sph_fugue512_context fugue;
sph_gost512_context gost; sph_gost512_context gost;
} poly_4way_ctx_holder; } poly_4way_ctx_holder;
@@ -27,12 +26,12 @@ poly_4way_ctx_holder poly_4way_ctx;
void init_polytimos_4way_ctx() void init_polytimos_4way_ctx()
{ {
skein512_4way_init( &poly_4way_ctx.skein ); skein512_4way_init( &poly_4way_ctx.skein );
shabal512_4way_init( &poly_4way_ctx.shabal ); shabal512_4way_init( &poly_4way_ctx.shabal );
init_echo( &poly_4way_ctx.echo, 512 ); init_echo( &poly_4way_ctx.echo, 512 );
init_luffa( &poly_4way_ctx.luffa, 512 ); luffa_2way_init( &poly_4way_ctx.luffa, 512 );
sph_fugue512_init( &poly_4way_ctx.fugue ); sph_fugue512_init( &poly_4way_ctx.fugue );
sph_gost512_init( &poly_4way_ctx.gost ); sph_gost512_init( &poly_4way_ctx.gost );
} }
void polytimos_4way_hash( void *output, const void *input ) void polytimos_4way_hash( void *output, const void *input )
@@ -67,17 +66,13 @@ void polytimos_4way_hash( void *output, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3, update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 ); sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -8,7 +8,7 @@
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/fugue//sph_fugue.h" #include "algo/fugue//sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/shabal/sph_shabal.h" #include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h" #include "algo/gost/sph_gost.h"
#ifndef NO_AES_NI #ifndef NO_AES_NI

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
@@ -29,10 +29,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
@@ -45,15 +45,14 @@ void init_x14_4way_ctx()
{ {
blake512_4way_init( &x14_4way_ctx.blake ); blake512_4way_init( &x14_4way_ctx.blake );
bmw512_4way_init( &x14_4way_ctx.bmw ); bmw512_4way_init( &x14_4way_ctx.bmw );
sph_bmw512_init( &x14_4way_ctx.bmw );
init_groestl( &x14_4way_ctx.groestl, 64 ); init_groestl( &x14_4way_ctx.groestl, 64 );
skein512_4way_init( &x14_4way_ctx.skein ); skein512_4way_init( &x14_4way_ctx.skein );
jh512_4way_init( &x14_4way_ctx.jh ); jh512_4way_init( &x14_4way_ctx.jh );
keccak512_4way_init( &x14_4way_ctx.keccak ); keccak512_4way_init( &x14_4way_ctx.keccak );
init_luffa( &x14_4way_ctx.luffa, 512 ); luffa_2way_init( &x14_4way_ctx.luffa, 512 );
cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x14_4way_ctx.shavite ); sph_shavite512_init( &x14_4way_ctx.shavite );
init_sd( &x14_4way_ctx.simd, 512 ); simd_2way_init( &x14_4way_ctx.simd, 512 );
init_echo( &x14_4way_ctx.echo, 512 ); init_echo( &x14_4way_ctx.echo, 512 );
hamsi512_4way_init( &x14_4way_ctx.hamsi ); hamsi512_4way_init( &x14_4way_ctx.hamsi );
sph_fugue512_init( &x14_4way_ctx.fugue ); sph_fugue512_init( &x14_4way_ctx.fugue );
@@ -109,17 +108,13 @@ void x14_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa // 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -147,17 +142,13 @@ void x14_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -173,10 +164,10 @@ void x14_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit // 12 Hamsi parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial // 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -20,9 +20,9 @@
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h" #include "algo/shabal/sph_shabal.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/echo/sse2/sph_echo.h" #include "algo/echo/sse2/sph_echo.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"

View File

@@ -12,14 +12,13 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
//#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h" #include "algo/whirlpool/sph_whirlpool.h"
@@ -31,13 +30,12 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
// sph_hamsi512_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
shabal512_4way_context shabal; shabal512_4way_context shabal;
sph_whirlpool_context whirlpool; sph_whirlpool_context whirlpool;
@@ -53,13 +51,12 @@ void init_x15_4way_ctx()
skein512_4way_init( &x15_4way_ctx.skein ); skein512_4way_init( &x15_4way_ctx.skein );
jh512_4way_init( &x15_4way_ctx.jh ); jh512_4way_init( &x15_4way_ctx.jh );
keccak512_4way_init( &x15_4way_ctx.keccak ); keccak512_4way_init( &x15_4way_ctx.keccak );
init_luffa( &x15_4way_ctx.luffa, 512 ); luffa_2way_init( &x15_4way_ctx.luffa, 512 );
cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x15_4way_ctx.shavite ); sph_shavite512_init( &x15_4way_ctx.shavite );
init_sd( &x15_4way_ctx.simd, 512 ); simd_2way_init( &x15_4way_ctx.simd, 512 );
init_echo( &x15_4way_ctx.echo, 512 ); init_echo( &x15_4way_ctx.echo, 512 );
hamsi512_4way_init( &x15_4way_ctx.hamsi ); hamsi512_4way_init( &x15_4way_ctx.hamsi );
// sph_hamsi512_init( &x15_4way_ctx.hamsi );
sph_fugue512_init( &x15_4way_ctx.fugue ); sph_fugue512_init( &x15_4way_ctx.fugue );
shabal512_4way_init( &x15_4way_ctx.shabal ); shabal512_4way_init( &x15_4way_ctx.shabal );
sph_whirlpool_init( &x15_4way_ctx.whirlpool ); sph_whirlpool_init( &x15_4way_ctx.whirlpool );
@@ -114,17 +111,13 @@ void x15_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa // 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -152,17 +145,13 @@ void x15_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -178,24 +167,11 @@ void x15_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit // 12 Hamsi parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
/*
// 12 Hamsi
sph_hamsi512( &ctx.hamsi, hash0, 64 );
sph_hamsi512_close( &ctx.hamsi, hash0 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash1, 64 );
sph_hamsi512_close( &ctx.hamsi, hash1 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash2, 64 );
sph_hamsi512_close( &ctx.hamsi, hash2 );
memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
sph_hamsi512( &ctx.hamsi, hash3, 64 );
sph_hamsi512_close( &ctx.hamsi, hash3 );
*/
// 13 Fugue // 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 ); sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -21,9 +21,9 @@
#include "algo/shabal/sph_shabal.h" #include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h" #include "algo/whirlpool/sph_whirlpool.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"

View File

@@ -23,9 +23,9 @@
#include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#endif #endif
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/jh/sse2/jh_sse2_opt64.h" #include "algo/jh/sse2/jh_sse2_opt64.h"
typedef struct { typedef struct {

View File

@@ -19,9 +19,9 @@
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -41,10 +41,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
shabal512_4way_context shabal; shabal512_4way_context shabal;
@@ -68,6 +68,10 @@ void x16r_4way_hash( void* output, const void* input )
uint32_t hash2[24] __attribute__ ((aligned (64))); uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64))); uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64))); uint32_t vhash[24*4] __attribute__ ((aligned (64)));
// uint32_t inp0[24] __attribute__ ((aligned (64)));
// uint32_t inp1[24] __attribute__ ((aligned (64)));
// uint32_t inp2[24] __attribute__ ((aligned (64)));
// uint32_t inp3[24] __attribute__ ((aligned (64)));
x16r_4way_ctx_holder ctx; x16r_4way_ctx_holder ctx;
@@ -75,7 +79,6 @@ void x16r_4way_hash( void* output, const void* input )
void *in1 = (void*) hash1; void *in1 = (void*) hash1;
void *in2 = (void*) hash2; void *in2 = (void*) hash2;
void *in3 = (void*) hash3; void *in3 = (void*) hash3;
int size = 80; int size = 80;
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 );
@@ -111,7 +114,7 @@ void x16r_4way_hash( void* output, const void* input )
blake512_4way( &ctx.blake, vhash, size ); blake512_4way( &ctx.blake, vhash, size );
} }
blake512_4way_close( &ctx.blake, vhash ); blake512_4way_close( &ctx.blake, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case BMW: case BMW:
bmw512_4way_init( &ctx.bmw ); bmw512_4way_init( &ctx.bmw );
@@ -123,7 +126,7 @@ void x16r_4way_hash( void* output, const void* input )
bmw512_4way( &ctx.bmw, vhash, size ); bmw512_4way( &ctx.bmw, vhash, size );
} }
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case GROESTL: case GROESTL:
init_groestl( &ctx.groestl, 64 ); init_groestl( &ctx.groestl, 64 );
@@ -149,7 +152,7 @@ void x16r_4way_hash( void* output, const void* input )
skein512_4way( &ctx.skein, vhash, size ); skein512_4way( &ctx.skein, vhash, size );
} }
skein512_4way_close( &ctx.skein, vhash ); skein512_4way_close( &ctx.skein, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case JH: case JH:
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -161,7 +164,7 @@ void x16r_4way_hash( void* output, const void* input )
jh512_4way( &ctx.jh, vhash, size ); jh512_4way( &ctx.jh, vhash, size );
} }
jh512_4way_close( &ctx.jh, vhash ); jh512_4way_close( &ctx.jh, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case KECCAK: case KECCAK:
keccak512_4way_init( &ctx.keccak ); keccak512_4way_init( &ctx.keccak );
@@ -173,21 +176,17 @@ void x16r_4way_hash( void* output, const void* input )
keccak512_4way( &ctx.keccak, vhash, size ); keccak512_4way( &ctx.keccak, vhash, size );
} }
keccak512_4way_close( &ctx.keccak, vhash ); keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case LUFFA: case LUFFA:
init_luffa( &ctx.luffa, 512 ); mm256_interleave_2x128( vhash, in0, in1, size<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)in0, size ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
init_luffa( &ctx.luffa, 512 ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, in2, in3, size<<3 );
(const BitSequence*)in1, size ); luffa_2way_init( &ctx.luffa, 512 );
init_luffa( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)in2, size );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)in3, size );
break; break;
case CUBEHASH: case CUBEHASH:
cubehashReinit( &ctx.cube ); cubehashReinit( &ctx.cube );
@@ -218,18 +217,14 @@ void x16r_4way_hash( void* output, const void* input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
break; break;
case SIMD: case SIMD:
init_sd( &ctx.simd, 512 ); mm256_interleave_2x128( vhash, in0, in1, size<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, simd_2way_init( &ctx.simd, 512 );
(const BitSequence*)in0, size<<3 ); simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
init_sd( &ctx.simd, 512 ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, in2, in3, size<<3 );
(const BitSequence*)in1, size<<3 ); simd_2way_init( &ctx.simd, 512 );
init_sd( &ctx.simd, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)in2, size<<3 );
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
break; break;
case ECHO: case ECHO:
init_echo( &ctx.echo, 512 ); init_echo( &ctx.echo, 512 );
@@ -246,11 +241,11 @@ void x16r_4way_hash( void* output, const void* input )
(const BitSequence*)in3, size<<3 ); (const BitSequence*)in3, size<<3 );
break; break;
case HAMSI: case HAMSI:
mm_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 ); mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size ); hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case FUGUE: case FUGUE:
sph_fugue512_init( &ctx.fugue ); sph_fugue512_init( &ctx.fugue );
@@ -271,7 +266,7 @@ void x16r_4way_hash( void* output, const void* input )
shabal512_4way_init( &ctx.shabal ); shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size ); shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash ); shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
case WHIRLPOOL: case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool_init( &ctx.whirlpool );
@@ -292,9 +287,13 @@ void x16r_4way_hash( void* output, const void* input )
sha512_4way_init( &ctx.sha512 ); sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size ); sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash ); sha512_4way_close( &ctx.sha512, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break; break;
} }
// in0 = (void*) hash0;
// in1 = (void*) hash1;
// in2 = (void*) hash2;
// in3 = (void*) hash3;
size = 64; size = 64;
} }
memcpy( output, hash0, 32 ); memcpy( output, hash0, 32 );
@@ -351,28 +350,28 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
x16r_4way_hash( hash, vdata ); x16r_4way_hash( hash, vdata );
pdata[19] = n; pdata[19] = n;
if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) ) if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{ {
found[0] = true; found[0] = true;
num_found++; num_found++;
nonces[0] = n; nonces[0] = n;
work_set_target_ratio( work, hash ); work_set_target_ratio( work, hash );
} }
if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) ) if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{ {
found[1] = true; found[1] = true;
num_found++; num_found++;
nonces[1] = n+1; nonces[1] = n+1;
work_set_target_ratio( work, hash+8 ); work_set_target_ratio( work, hash+8 );
} }
if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) ) if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{ {
found[2] = true; found[2] = true;
num_found++; num_found++;
nonces[2] = n+2; nonces[2] = n+2;
work_set_target_ratio( work, hash+16 ); work_set_target_ratio( work, hash+16 );
} }
if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) ) if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{ {
found[3] = true; found[3] = true;
num_found++; num_found++;

View File

@@ -16,9 +16,9 @@
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h" #include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h" #include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -117,7 +117,7 @@ void x16r_hash( void* output, const void* input )
case GROESTL: case GROESTL:
#ifdef NO_AES_NI #ifdef NO_AES_NI
sph_groestl512_init( &ctx.groestl ); sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size<<3 ); sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash); sph_groestl512_close(&ctx.groestl, hash);
#else #else
init_groestl( &ctx.groestl, 64 ); init_groestl( &ctx.groestl, 64 );

View File

@@ -12,10 +12,10 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -31,10 +31,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
@@ -54,10 +54,10 @@ void init_x17_4way_ctx()
skein512_4way_init( &x17_4way_ctx.skein ); skein512_4way_init( &x17_4way_ctx.skein );
jh512_4way_init( &x17_4way_ctx.jh ); jh512_4way_init( &x17_4way_ctx.jh );
keccak512_4way_init( &x17_4way_ctx.keccak ); keccak512_4way_init( &x17_4way_ctx.keccak );
init_luffa( &x17_4way_ctx.luffa, 512 ); luffa_2way_init( &x17_4way_ctx.luffa, 512 );
cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x17_4way_ctx.shavite ); sph_shavite512_init( &x17_4way_ctx.shavite );
init_sd( &x17_4way_ctx.simd, 512 ); simd_2way_init( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 ); init_echo( &x17_4way_ctx.echo, 512 );
hamsi512_4way_init( &x17_4way_ctx.hamsi ); hamsi512_4way_init( &x17_4way_ctx.hamsi );
sph_fugue512_init( &x17_4way_ctx.fugue ); sph_fugue512_init( &x17_4way_ctx.fugue );
@@ -114,18 +114,14 @@ void x17_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa serial // 7 Luffa
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence*)hash0, 64 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence*)hash1, 64 ); luffa_2way_init( &ctx.luffa, 512 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence*)hash2, 64 );
memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, 64 );
// 8 Cubehash // 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
@@ -153,17 +149,13 @@ void x17_4way_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd // 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, 512 );
(const BitSequence *)hash0, 512 ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, 512 );
(const BitSequence *)hash1, 512 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
(const BitSequence *)hash2, 512 );
memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
// 11 Echo // 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -178,11 +170,11 @@ void x17_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3, update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit // 12 Hamsi
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue // 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512( &ctx.fugue, hash0, 64 );

View File

@@ -21,9 +21,9 @@
#include "algo/sha/sph_sha2.h" #include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h" #include "algo/haval/sph-haval.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c" #include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c" #include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c" #include "algo/keccak/sse2/keccak.c"

View File

@@ -13,9 +13,9 @@
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
@@ -31,10 +31,10 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
hashState_sd simd; simd_2way_context simd;
hashState_echo echo; hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
sph_fugue512_context fugue; sph_fugue512_context fugue;
@@ -56,10 +56,10 @@ void init_xevan_4way_ctx()
skein512_4way_init(&xevan_4way_ctx.skein); skein512_4way_init(&xevan_4way_ctx.skein);
jh512_4way_init(&xevan_4way_ctx.jh); jh512_4way_init(&xevan_4way_ctx.jh);
keccak512_4way_init(&xevan_4way_ctx.keccak); keccak512_4way_init(&xevan_4way_ctx.keccak);
init_luffa( &xevan_4way_ctx.luffa, 512 ); luffa_2way_init( &xevan_4way_ctx.luffa, 512 );
cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &xevan_4way_ctx.shavite ); sph_shavite512_init( &xevan_4way_ctx.shavite );
init_sd( &xevan_4way_ctx.simd, 512 ); simd_2way_init( &xevan_4way_ctx.simd, 512 );
init_echo( &xevan_4way_ctx.echo, 512 ); init_echo( &xevan_4way_ctx.echo, 512 );
hamsi512_4way_init( &xevan_4way_ctx.hamsi ); hamsi512_4way_init( &xevan_4way_ctx.hamsi );
sph_fugue512_init( &xevan_4way_ctx.fugue ); sph_fugue512_init( &xevan_4way_ctx.fugue );
@@ -127,20 +127,14 @@ void xevan_4way_hash( void *output, const void *input )
keccak512_4way( &ctx.keccak, vhash, dataLen ); keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash ); keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
(const BitSequence*)hash0, dataLen ); mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)hash1, dataLen ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen ); dataLen );
@@ -169,17 +163,13 @@ void xevan_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, dataLen ); sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
(const BitSequence *)hash0, dataLen<<3 ); simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
(const BitSequence *)hash1, dataLen<<3 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 ); (const BitSequence *) hash0, dataLen<<3 );
@@ -192,12 +182,11 @@ void xevan_4way_hash( void *output, const void *input )
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3, update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 ); (const BitSequence *) hash3, dataLen<<3 );
// Parallel
// Parallel 32 bit mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way( &ctx.hamsi, vhash, dataLen ); hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 ); sph_fugue512_close( &ctx.fugue, hash0 );
@@ -278,18 +267,13 @@ void xevan_4way_hash( void *output, const void *input )
keccak512_4way_close( &ctx.keccak, vhash ); keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
(const BitSequence*)hash0, dataLen ); mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, luffa_2way_init( &ctx.luffa, 512 );
(const BitSequence*)hash1, dataLen ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
(const BitSequence*)hash2, dataLen );
memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
(const BitSequence*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen ); dataLen );
@@ -318,17 +302,13 @@ void xevan_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, dataLen ); sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 ); sph_shavite512_close( &ctx.shavite, hash3 );
update_final_sd( &ctx.simd, (BitSequence *)hash0, mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
(const BitSequence *)hash0, dataLen<<3 ); simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash1, mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
(const BitSequence *)hash1, dataLen<<3 ); simd_2way_init( &ctx.simd, 512 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
update_final_sd( &ctx.simd, (BitSequence *)hash2, mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
(const BitSequence *)hash2, dataLen<<3 );
memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) );
update_final_sd( &ctx.simd, (BitSequence *)hash3,
(const BitSequence *)hash3, dataLen<<3 );
update_final_echo( &ctx.echo, (BitSequence *)hash0, update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 ); (const BitSequence *) hash0, dataLen<<3 );
@@ -342,10 +322,10 @@ void xevan_4way_hash( void *output, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3, update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 ); (const BitSequence *) hash3, dataLen<<3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way( &ctx.hamsi, vhash, dataLen ); hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 ); sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -11,14 +11,14 @@
#include "algo/keccak/sph_keccak.h" #include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h" #include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/luffa/luffa_for_sse2.h"
#include "algo/hamsi/sph_hamsi.h" #include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h" #include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h" #include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h" #include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h" #include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h" #include "algo/haval/sph-haval.h"
#include "algo/simd/sse2/nist.h" #include "algo/simd/nist.h"
#include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h"
#include <openssl/sha.h> #include <openssl/sha.h>
#ifdef NO_AES_NI #ifdef NO_AES_NI

View File

@@ -424,12 +424,17 @@ int64_t yescryptr16_get_max64()
return 0xfffLL; return 0xfffLL;
} }
bool register_yescrypt_algo( algo_gate_t* gate ) void yescrypt_gate_base(algo_gate_t *gate )
{ {
gate->optimizations = SSE2_OPT | SHA_OPT; gate->optimizations = SSE2_OPT | AVX_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt; gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash; gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target; gate->set_target = (void*)&scrypt_set_target;
}
bool register_yescrypt_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescrypt_get_max64; gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = true; client_key_hack = true;
YESCRYPT_N = 2048; YESCRYPT_N = 2048;
@@ -440,10 +445,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr8_algo( algo_gate_t* gate ) bool register_yescryptr8_algo( algo_gate_t* gate )
{ {
gate->optimizations = SSE2_OPT | SHA_OPT; yescrypt_gate_base( gate );
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&yescrypt_get_max64; gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = false; client_key_hack = false;
YESCRYPT_N = 2048; YESCRYPT_N = 2048;
@@ -454,10 +456,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate ) bool register_yescryptr16_algo( algo_gate_t* gate )
{ {
gate->optimizations = SSE2_OPT | SHA_OPT; yescrypt_gate_base( gate );
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->get_max64 = (void*)&yescryptr16_get_max64; gate->get_max64 = (void*)&yescryptr16_get_max64;
client_key_hack = false; client_key_hack = false;
YESCRYPT_N = 4096; YESCRYPT_N = 4096;

1314
avxdefs.h

File diff suppressed because it is too large Load Diff

View File

@@ -3,16 +3,6 @@
make distclean || echo clean make distclean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-4way.exe
strip -s cpuminer
mv cpuminer cpuminer-4way
make clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe

View File

@@ -18,8 +18,8 @@ rm -f config.status
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+) # Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores" #extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4 make -j 4

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.0.1. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.1.1.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.0.1' PACKAGE_VERSION='3.8.1.1'
PACKAGE_STRING='cpuminer-opt 3.8.0.1' PACKAGE_STRING='cpuminer-opt 3.8.1.1'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.0.1 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.8.1.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.8.0.1:";; short | recursive ) echo "Configuration of cpuminer-opt 3.8.1.1:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.8.0.1 cpuminer-opt configure 3.8.1.1
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.8.0.1, which was It was created by cpuminer-opt $as_me 3.8.1.1, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.8.0.1' VERSION='3.8.1.1'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.8.0.1, which was This file was extended by cpuminer-opt $as_me 3.8.1.1, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.8.0.1 cpuminer-opt config.status 3.8.1.1
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.0.1]) AC_INIT([cpuminer-opt], [3.8.1.1])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -3238,10 +3238,10 @@ int main(int argc, char *argv[])
} }
} }
//#ifdef HAVE_SYSLOG_H #ifdef HAVE_SYSLOG_H
// if (use_syslog) if (use_syslog)
// openlog("cpuminer", LOG_PID, LOG_USER); openlog("cpuminer", LOG_PID, LOG_USER);
//#endif #endif
work_restart = (struct work_restart*) calloc(opt_n_threads, sizeof(*work_restart)); work_restart = (struct work_restart*) calloc(opt_n_threads, sizeof(*work_restart));
if (!work_restart) if (!work_restart)

10
miner.h
View File

@@ -80,10 +80,10 @@ void *alloca (size_t);
# endif # endif
//#endif //#endif
//#ifdef HAVE_SYSLOG_H #ifdef HAVE_SYSLOG_H
//#include <syslog.h> #include <syslog.h>
//#define LOG_BLUE 0x10 /* unique value */ #define LOG_BLUE 0x10 /* unique value */
//#else #else
enum { enum {
LOG_ERR, LOG_ERR,
LOG_WARNING, LOG_WARNING,
@@ -93,7 +93,7 @@ enum {
/* custom notices */ /* custom notices */
LOG_BLUE = 0x10, LOG_BLUE = 0x10,
}; };
//#endif #endif
static inline bool is_windows(void) static inline bool is_windows(void)
{ {