From 0681ca996d8d864d65ea03ec77b9184bf4f820ed Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Thu, 30 Jan 2020 03:47:11 -0500
Subject: [PATCH] v3.11.8

---
 RELEASE_NOTES                               |   14 +-
 algo/cubehash/cubehash_sse2.c               |   89 +-
 algo/cubehash/cubehash_sse2.h               |    5 +-
 algo/groestl/aes_ni/groestl-asm-aes.h       | 1043 --------------
 algo/groestl/aes_ni/groestl-asm-avx.h       | 1105 ---------------
 algo/groestl/aes_ni/groestl-asm-vperm.h     | 1397 -------------------
 algo/groestl/aes_ni/groestl-intr-aes.h      |  222 +--
 algo/groestl/aes_ni/groestl-intr-avx.h      | 1072 --------------
 algo/groestl/aes_ni/groestl-intr-vperm.h    | 1294 -----------------
 algo/groestl/aes_ni/groestl-version.h       |   10 -
 algo/groestl/aes_ni/groestl256-asm-aes.h    |  529 -------
 algo/groestl/aes_ni/groestl256-asm-avx.h    |  519 -------
 algo/groestl/aes_ni/groestl256-asm-vperm.h  |  856 ------------
 algo/groestl/aes_ni/groestl256-intr-aes.h   |  121 +-
 algo/groestl/aes_ni/groestl256-intr-avx.h   |  482 -------
 algo/groestl/aes_ni/groestl256-intr-vperm.h |  793 -----------
 algo/groestl/aes_ni/hash-groestl.c          |  112 +-
 algo/groestl/aes_ni/hash-groestl256.c       |   37 +-
 algo/keccak/keccak-4way.c                   |    4 +-
 algo/keccak/keccak-gate.c                   |    2 +-
 algo/keccak/sha3d-4way.c                    |   12 +-
 algo/luffa/luffa_for_sse2.c                 |   63 +-
 algo/luffa/luffa_for_sse2.h                 |    3 +-
 algo/lyra2/allium-4way.c                    |   10 +-
 algo/lyra2/lyra2-gate.c                     |    2 +-
 algo/lyra2/lyra2-gate.h                     |    2 +-
 algo/simd/nist.c                            |  124 +-
 algo/simd/nist.h                            |    4 +-
 algo/skein/skein-hash-4way.c                |   13 +-
 algo/x11/tribus-4way.c                      |    4 +-
 algo/x16/hex.c                              |  165 ++-
 algo/x16/x16r-4way.c                        |  608 +++++---
 algo/x16/x16r-gate.c                        |    2 +
 algo/x16/x16r-gate.h                        |    2 +-
 algo/x16/x16rt-4way.c                       |  786 +++++++----
 algo/x16/x16rv2-4way.c                      | 1009 +++++++++-----
 algo/x16/x21s-4way.c                        |  786 +++++++----
 algo/x17/x17-4way.c                         |    6 +-
 algo/x17/x17.c                              |   64 +-
 algo/yespower/yescrypt-r8g.c                |    3 +-
 configure                                   |   20 +-
 configure.ac                                |    2 +-
 cpu-miner.c                                 |  102 +-
 miner.h                                     |    4 +-
 simd-utils/intrlv.h                         |   34 +-
 util.c                                      |   21 +-
 46 files changed, 2882 insertions(+), 10675 deletions(-)
 delete mode 100644 algo/groestl/aes_ni/groestl-asm-aes.h
 delete mode 100644 algo/groestl/aes_ni/groestl-asm-avx.h
 delete mode 100644 algo/groestl/aes_ni/groestl-asm-vperm.h
 delete mode 100644 algo/groestl/aes_ni/groestl-intr-avx.h
 delete mode 100644 algo/groestl/aes_ni/groestl-intr-vperm.h
 delete mode 100644 algo/groestl/aes_ni/groestl-version.h
 delete mode 100644 algo/groestl/aes_ni/groestl256-asm-aes.h
 delete mode 100644 algo/groestl/aes_ni/groestl256-asm-avx.h
 delete mode 100644 algo/groestl/aes_ni/groestl256-asm-vperm.h
 delete mode 100644 algo/groestl/aes_ni/groestl256-intr-avx.h
 delete mode 100644 algo/groestl/aes_ni/groestl256-intr-vperm.h

diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index aa6938b..1a90761 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,9 +65,21 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.11.8
+
+Fixed network hashrate showing incorrect data; it should be close now.
+
+Fixed compile errors when using GCC 10, which enables -fno-common by default.
+
+Faster x16r, x16rv2, x16rt, x16s, x21s, veil and hex with midstate prehash.
+
+Decoupled sapling usage from block version 5 in yescryptr8g.
+
+More detailed data reporting for low-difficulty rejected shares.
+
 v3.11.7
 
-Added yescryptr8g algo fotr KOTO, including support for block version 5.
+Added yescryptr8g algo for KOTO, including support for block version 5.
 
 Added sha3d algo for BSHA3.
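For context on the midstate prehash speedup listed above: an 80-byte block header spans one full 64-byte input block plus a 16-byte tail, and only the tail changes while scanning nonces. Hashing the constant first block once per work unit and restarting each nonce from that saved midstate removes most of the first-block work from the scan loop. The C sketch below is illustrative only; the toy_* names and mixing function are hypothetical stand-ins, not code from this patch.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Toy stand-in for a real 64-byte-block hash context; the real miner
   carries the state of the first hash in the x16r-style chain. */
typedef struct { uint64_t s; } toy_ctx;

static void toy_absorb( toy_ctx *c, const uint8_t *p, size_t n )
{
   for ( size_t i = 0; i < n; i++ )
      c->s = ( c->s ^ p[i] ) * 0x100000001b3ULL;  /* FNV-1a style mixing */
}

int main(void)
{
   uint8_t header[80] = {0};                /* bytes 76..79 hold the nonce */
   toy_ctx midstate = { 0xcbf29ce484222325ULL };

   toy_absorb( &midstate, header, 64 );     /* prehash: once per work unit */

   for ( uint32_t nonce = 0; nonce < 4; nonce++ )
   {
      memcpy( header + 76, &nonce, sizeof nonce );
      toy_ctx c = midstate;                 /* resume from saved midstate  */
      toy_absorb( &c, header + 64, 16 );    /* per nonce: only the tail    */
      printf( "nonce %u -> %016llx\n", (unsigned)nonce,
              (unsigned long long)c.s );
   }
   return 0;
}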
diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c
index c508248..c87829d 100644
--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -230,11 +230,10 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
 
    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
-                      _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                    0,0,0,0, 0,0,0,0x80 ) );
+                      m128_const_64( 0, 0x80 ) );
 
    transform( sp );
-   sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
+   sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
    transform( sp );
    transform( sp );
    transform( sp );
@@ -276,11 +275,89 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
 
    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
-                      _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                    0,0,0,0, 0,0,0,0x80 ) );
+                      m128_const_64( 0, 0x80 ) );
 
    transform( sp );
-   sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
+   sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
+
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+
+   for ( i = 0; i < sp->hashlen; i++ )
+      hash[i] = sp->x[i];
+
+   return SUCCESS;
+}
+
+int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
+                   const byte *data, size_t size )
+{
+   __m128i *x = (__m128i*)sp->x;
+   sp->hashlen = hashbitlen/128;
+   sp->blocksize = 32/16;
+   sp->rounds = 16;
+   sp->pos = 0;
+
+   if ( hashbitlen == 512 )
+   {
+
+      x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
+      x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
+      x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
+      x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
+      x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
+      x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
+      x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
+      x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
+   }
+   else
+   {
+      x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
+      x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
+      x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
+      x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
+      x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
+      x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
+      x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
+      x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
+   }
+
+
+
+
+   const int len = size / 16;
+   const __m128i* in = (__m128i*)data;
+   __m128i* hash = (__m128i*)digest;
+   int i;
+
+   // It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
+   // In current usage, data is either 64 or 80 bytes.
+
+   for ( i = 0; i < len; i++ )
+   {
+      sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
+      sp->pos++;
+      if ( sp->pos == sp->blocksize )
+      {
+         transform( sp );
+         sp->pos = 0;
+      }
+   }
+
+   // pos is zero for 64 byte data, 1 for 80 byte data.
+ sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], + m128_const_64( 0, 0x80 ) ); + transform( sp ); + + sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) ); transform( sp ); transform( sp ); diff --git a/algo/cubehash/cubehash_sse2.h b/algo/cubehash/cubehash_sse2.h index 4e1eaa3..69da618 100644 --- a/algo/cubehash/cubehash_sse2.h +++ b/algo/cubehash/cubehash_sse2.h @@ -19,7 +19,7 @@ struct _cubehashParam int rounds; int blocksize; // __m128i int pos; // number of __m128i read into x from current block - __m128i _ALIGN(256) x[8]; // aligned for __m256i + __m128i _ALIGN(64) x[8]; // aligned for __m256i }; typedef struct _cubehashParam cubehashParam; @@ -39,6 +39,9 @@ int cubehashDigest(cubehashParam* sp, byte *digest); int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data, size_t size ); +int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen, + const byte *data, size_t size ); + #ifdef __cplusplus } #endif diff --git a/algo/groestl/aes_ni/groestl-asm-aes.h b/algo/groestl/aes_ni/groestl-asm-aes.h deleted file mode 100644 index c4e44a4..0000000 --- a/algo/groestl/aes_ni/groestl-asm-aes.h +++ /dev/null @@ -1,1043 +0,0 @@ -/* groestl-asm-aes.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3, sse4.1, and aes - * instructions. - * Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz - * - * This code is placed in the public domain - */ - -#include "hash-groestl.h" -/* global constants */ -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; -__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; - -/* temporary variables */ -__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP[3*16]; - - -#define tos(a) #a -#define tostr(a) tos(a) - - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b */ -#define MUL2(i, j, k){\ - asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\ - asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\ - asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\ - asm("pand xmm"tostr(j)", xmm"tostr(k)"");\ - asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* Yet another implementation of MixBytes. - This time we use the formulae (3) from the paper "Byte Slicing Groestl". - Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). - but we use the relations: - t_i = a_i + a_{i+3} - x_i = t_i + t_{i+3} - y_i = t_i + t+{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. 
Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* t_i = a_i + a_{i+1} */\ - asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ - asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\ - asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\ - asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\ - asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\ - asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\ - asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\ - asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\ - /* spill values y_4, y_5 to memory */\ - asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\ - asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ - asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ - asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\ - \ - /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - /* compute w_i : add y_{i+4} */\ - asm("movaps xmm"tostr(b1)", [ALL_1B]");\ - MUL2(a0, b0, b1);\ - asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\ - MUL2(a1, b0, b1);\ - asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\ - MUL2(a2, b0, b1);\ - asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\ - MUL2(a3, b0, b1);\ - asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\ - MUL2(a4, b0, b1);\ - asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\ - MUL2(a5, b0, b1);\ - asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\ - MUL2(a6, b0, b1);\ - asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\ - MUL2(a7, b0, b1);\ - asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\ - \ - /* compute v_i : double w_i */\ - /* add to y_4 y_5 .. v3, v4, ... 
*/\ - MUL2(a0, b0, b1);\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ - MUL2(a1, b0, b1);\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\ - MUL2(a2, b0, b1);\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ - MUL2(a5, b0, b1);\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\ - MUL2(a6, b0, b1);\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ - MUL2(a7, b0, b1);\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ - MUL2(a3, b0, b1);\ - MUL2(a4, b0, b1);\ - asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\ - asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ -}/*MixBytes*/ - -#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}while(0); - -#define Push_All_Regs() do{\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}while(0); - -#define Pop_All_Regs() do{\ -/* not using any... 
- asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}while(0); - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant */\ - asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - /* ShiftBytes + SubBytes (interleaved) */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ - \ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - \ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - \ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ - \ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd 
xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - \ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ - \ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ - asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i7)", 
xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm12 - xmm15 */ - asm ("movaps xmm12, [rdi+0*16]"); - asm ("movaps xmm13, [rdi+1*16]"); - asm ("movaps xmm14, [rdi+2*16]"); - asm ("movaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm12"); - asm ("movaps [rdi+1*16], xmm2"); - asm ("movaps [rdi+2*16], xmm6"); - asm ("movaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("movaps xmm12, [rsi+0*16]"); - asm ("movaps xmm13, [rsi+1*16]"); - asm ("movaps xmm14, [rsi+2*16]"); - asm ("movaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm0, [rdi+1*16]"); - asm ("movaps xmm4, [rdi+2*16]"); - asm ("movaps xmm5, [rdi+3*16]"); - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("pxor xmm8, xmm12"); - asm ("pxor xmm0, xmm2"); - asm ("pxor xmm4, xmm6"); - asm ("pxor xmm5, xmm7"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("pxor xmm0, xmm8"); - asm ("pxor xmm1, xmm10"); - asm ("pxor xmm2, xmm12"); - asm ("pxor xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("pxor xmm0, [rdi+0*16]"); - asm ("pxor xmm1, [rdi+1*16]"); - asm ("pxor xmm2, [rdi+2*16]"); - asm ("pxor xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm0"); - asm ("movaps [rdi+1*16], xmm1"); - asm ("movaps [rdi+2*16], xmm2"); - asm ("movaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); 
- -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm10, [rdi+1*16]"); - asm ("movaps xmm12, [rdi+2*16]"); - asm ("movaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm10, [rdi+1*16]"); - asm ("pxor xmm12, [rdi+2*16]"); - asm ("pxor xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+2*16], xmm9"); - asm ("movaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - -#if (LENGTH > 256) - -#define SET_CONSTANTS(){\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ - ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0b0e0104070a0d00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x0306090c0f020508ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0c0f0205080b0e01ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070a0d00030609ULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0d000306090c0f02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05080b0e0104070aULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0e0104070a0d0003ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x06090c0f0205080bULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0f0205080b0e0104ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x070a0d000306090cULL;\ - ((u64*)SUBSH_MASK)[10] = 0x000306090c0f0205ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x080b0e0104070a0dULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0104070a0d000306ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x090c0f0205080b0eULL;\ - ((u64*)SUBSH_MASK)[14] = 0x06090c0f0205080bULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0e0104070a0d0003ULL;\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ - ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ - ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - }\ -}while(0); - -#define Push_All_Regs() do{\ - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");\ -}while(0); - -#define Pop_All_Regs() do{\ - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");\ -}while(0); - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* SubBytes */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("aesenclast 
xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -#define ROUNDS_P(){\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("1:");\ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm8, [ROUND_CONST_P+eax*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm9, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm10, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm11, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm12, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm13, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm14, [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm15, [SUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - \ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm0, [ROUND_CONST_P+ebx*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm1, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm2, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm3, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm4, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm5, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm6, [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm7, [SUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 1b");\ -} - -#define ROUNDS_Q(){\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("2:");\ - /* AddRoundConstant Q1024 */\ - asm ("movaps xmm1, [ALL_FF]");\ - asm ("pxor xmm8, xmm1");\ - asm ("pxor xmm9, xmm1");\ - asm ("pxor xmm10, xmm1");\ - asm ("pxor xmm11, xmm1");\ - asm ("pxor xmm12, xmm1");\ - asm ("pxor xmm13, xmm1");\ - asm ("pxor xmm14, xmm1");\ - asm ("pxor xmm15, [ROUND_CONST_Q+eax*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm9, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm10, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm11, [SUBSH_MASK+7*16]");\ - asm ("pshufb xmm12, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm13, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm14, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm15, [SUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - \ - /* AddConstant */\ - asm ("movaps xmm9, [ALL_FF]");\ - asm ("pxor xmm0, xmm9");\ - asm ("pxor xmm1, xmm9");\ - asm ("pxor xmm2, xmm9");\ - asm ("pxor xmm3, xmm9");\ - asm ("pxor xmm4, xmm9");\ - asm ("pxor xmm5, xmm9");\ - asm ("pxor xmm6, xmm9");\ - asm ("pxor xmm7, [ROUND_CONST_Q+ebx*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm1, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm2, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm3, [SUBSH_MASK+7*16]");\ - asm ("pshufb xmm4, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm5, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm6, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm7, [SUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - 
asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 2b");\ -} - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ - \ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i2)"");\ - asm ("pshufb xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("pshufb xmm"tostr(i7)", xmm"tostr(t0)"");\ - \ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i0)"");\ - asm ("punpckhwd xmm"tostr(t2)", xmm"tostr(i5)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i7)"");\ - asm ("punpcklwd xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(t1)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - \ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ - \ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(i0)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(t4)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(t5)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t0)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(t5)", xmm"tostr(t1)"");\ - asm ("movdqa xmm"tostr(t6)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(t7)", xmm"tostr(t2)"");\ - asm ("punpckhdq xmm"tostr(t6)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t2)", xmm"tostr(t3)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(t7)", xmm"tostr(t3)"");\ - \ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i4)", xmm"tostr(t4)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(t4)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i6)", xmm"tostr(t5)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t6)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(t5)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t6)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t7)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t7)"");\ - /* 
transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - /* transpose matrix to get output format */\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(t0)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(t1)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i6)"");\ - asm ("movaps xmm"tostr(o0)", [TRANSP_MASK]");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(t2)", xmm"tostr(i7)"");\ - /* load transpose mask into a register, because it will be used 8 times */\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i4)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(o1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(o0)"");\ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(t1)"");\ - \ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("punpckhwd xmm"tostr(o0)", xmm"tostr(i2)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhwd xmm"tostr(o2)", xmm"tostr(t0)"");\ - asm ("punpcklwd xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhwd xmm"tostr(t4)", xmm"tostr(t2)"");\ - asm ("punpcklwd xmm"tostr(t1)", xmm"tostr(t2)"");\ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(o0)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(o2)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(o0)", xmm"tostr(t3)"");\ - asm ("punpckhdq xmm"tostr(i3)", xmm"tostr(t3)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(i5)", xmm"tostr(t1)"");\ - asm ("punpckldq xmm"tostr(o2)", xmm"tostr(t4)"");\ - asm ("punpckhdq xmm"tostr(i7)", xmm"tostr(t4)"");\ - /* transpose done */\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile 
("emms"); - - /* load IV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* transform chaining value from column ordering into row ordering */ - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF1024(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm8 - xmm15 (Q = message) */ - asm ("movaps xmm8, [rsi+0*16]"); - asm ("movaps xmm9, [rsi+1*16]"); - asm ("movaps xmm10, [rsi+2*16]"); - asm ("movaps xmm11, [rsi+3*16]"); - asm ("movaps xmm12, [rsi+4*16]"); - asm ("movaps xmm13, [rsi+5*16]"); - asm ("movaps xmm14, [rsi+6*16]"); - asm ("movaps xmm15, [rsi+7*16]"); - - /* transform message M from column ordering into row ordering */ - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store message M (Q input) for later */ - asm ("movaps [QTEMP+0*16], xmm8"); - asm ("movaps [QTEMP+1*16], xmm9"); - asm ("movaps [QTEMP+2*16], xmm10"); - asm ("movaps [QTEMP+3*16], xmm11"); - asm ("movaps [QTEMP+4*16], xmm12"); - asm ("movaps [QTEMP+5*16], xmm13"); - asm ("movaps [QTEMP+6*16], xmm14"); - asm ("movaps [QTEMP+7*16], xmm15"); - - /* xor CV to message to get P input */ - /* result: CV+M in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV+M) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV+M)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* store P(CV+M)+CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - /* load message M (Q input) into xmm8-15 */ - asm ("movaps xmm8, [QTEMP+0*16]"); - asm ("movaps xmm9, [QTEMP+1*16]"); - asm ("movaps xmm10, [QTEMP+2*16]"); - asm ("movaps xmm11, [QTEMP+3*16]"); - asm ("movaps xmm12, [QTEMP+4*16]"); - asm ("movaps xmm13, [QTEMP+5*16]"); - asm ("movaps xmm14, [QTEMP+6*16]"); - asm ("movaps xmm15, [QTEMP+7*16]"); - - /* compute permutation Q */ - /* result: Q(M) in xmm8...xmm15 */ - ROUNDS_Q(); - - /* xor Q output */ - /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ - asm 
("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF1024(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+4*16], xmm0"); - asm ("movaps [rdi+5*16], xmm6"); - asm ("movaps [rdi+6*16], xmm13"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - diff --git a/algo/groestl/aes_ni/groestl-asm-avx.h b/algo/groestl/aes_ni/groestl-asm-avx.h deleted file mode 100644 index 6e8be1b..0000000 --- a/algo/groestl/aes_ni/groestl-asm-avx.h +++ /dev/null @@ -1,1105 +0,0 @@ -/* groestl-asm-avx.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx - * instructions. - * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz - * - * This code is placed in the public domain - */ - -#include "hash-groestl.h" - -/* global variables */ -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; -__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (32))) unsigned char ALL_1B[32]; -__attribute__ ((aligned (32))) unsigned char ALL_FF[32]; - -/* temporary variables */ -__attribute__ ((aligned (32))) unsigned char TEMP[6*32]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}while(0); - -#define Push_All_Regs() do{\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}while(0); - -#define Pop_All_Regs() do{\ -/* not using any... - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}while(0); - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2(i, j, k, z){\ - asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2v2(i, j, k, z){\ - asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* Yet another implementation of MixBytes. 
- This time we use the formulae (3) from the paper "Byte Slicing Groestl". - Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). - but we use the relations: - t_i = a_i + a_{i+3} - x_i = t_i + t_{i+3} - y_i = t_i + t+{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ - asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ - asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ - asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ - asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ - asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ - asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ - asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ - asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ - \ - /* t_i = a_i + a_{i+1} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\ - \ - asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\ - \ - /* spill values y_4, y_5 to memory */\ - asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\ - asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\ - asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\ - \ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ - asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(a4)", 
xmm"tostr(a4)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\ - \ - /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\ - VMUL2(a7, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a0, b0, b1, b2);\ - \ - /* compute w_i : add y_{i+4} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\ - asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a7, b0, b1, b2);\ - \ - /* add to y_4 y_5 .. v3, v4, ... */\ - asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\ -}/*MixBytes*/ - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant */\ - asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - /* ShiftBytes + SubBytes (interleaved) */\ - asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", 
xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ -\ - asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ -\ - asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ -\ - asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\ - asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\ - asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\ - 
asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\ - asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm12 - xmm15 */ - asm ("vmovaps xmm12, [rdi+0*16]"); - asm ("vmovaps xmm13, [rdi+1*16]"); - asm ("vmovaps xmm14, [rdi+2*16]"); - asm ("vmovaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("vmovaps [rdi+0*16], xmm12"); - asm ("vmovaps [rdi+1*16], xmm2"); - asm ("vmovaps [rdi+2*16], xmm6"); - asm ("vmovaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm 
(".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("vmovaps xmm12, [rsi+0*16]"); - asm ("vmovaps xmm13, [rsi+1*16]"); - asm ("vmovaps xmm14, [rsi+2*16]"); - asm ("vmovaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("vpxor xmm8, xmm12, [rdi+0*16]"); - asm ("vpxor xmm0, xmm2, [rdi+1*16]"); - asm ("vpxor xmm4, xmm6, [rdi+2*16]"); - asm ("vpxor xmm5, xmm7, [rdi+3*16]"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("vpxor xmm0, xmm0, xmm8"); - asm ("vpxor xmm1, xmm1, xmm10"); - asm ("vpxor xmm2, xmm2, xmm12"); - asm ("vpxor xmm3, xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("vpxor xmm0, xmm0, [rdi+0*16]"); - asm ("vpxor xmm1, xmm1, [rdi+1*16]"); - asm ("vpxor xmm2, xmm2, [rdi+2*16]"); - asm ("vpxor xmm3, xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("vmovaps [rdi+0*16], xmm0"); - asm ("vmovaps [rdi+1*16], xmm1"); - asm ("vmovaps [rdi+2*16], xmm2"); - asm ("vmovaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("vmovaps xmm8, [rdi+0*16]"); - asm ("vmovaps xmm10, [rdi+1*16]"); - asm ("vmovaps xmm12, [rdi+2*16]"); - asm ("vmovaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("vpxor xmm8, xmm8, [rdi+0*16]"); - asm ("vpxor xmm10, xmm10, [rdi+1*16]"); - asm ("vpxor xmm12, xmm12, [rdi+2*16]"); - asm ("vpxor xmm14, xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - - /* we only need to return the truncated half of the state */ - asm ("vmovaps [rdi+2*16], xmm9"); - asm ("vmovaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm 
(".att_syntax noprefix"); - - return; -} - -#endif - -#if (LENGTH > 256) - -#define SET_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ - ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ - ((u64*)ALL_FF)[2] = 0x0000000000000000ULL;\ - ((u64*)ALL_FF)[3] = 0x0000000000000000ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[2] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[3] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0b0e0104070a0d00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x0306090c0f020508ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0c0f0205080b0e01ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070a0d00030609ULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0d000306090c0f02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05080b0e0104070aULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0e0104070a0d0003ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x06090c0f0205080bULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0f0205080b0e0104ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x070a0d000306090cULL;\ - ((u64*)SUBSH_MASK)[10] = 0x000306090c0f0205ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x080b0e0104070a0dULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0104070a0d000306ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x090c0f0205080b0eULL;\ - ((u64*)SUBSH_MASK)[14] = 0x06090c0f0205080bULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0e0104070a0d0003ULL;\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ - ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ - ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - }\ -}while(0); - -#define Push_All_Regs() do{\ - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");\ -}while(0); - -#define Pop_All_Regs() do{\ - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");\ -}while(0); - -/* AVX MUL2 - * ymm[i] will be multiplied by 2 - * ymm[j] will be lost - * ymm[k] has to be all 0x1b - * ymm[z] has to be zero - * clobbers: t2, t3 */ -#define VMUL2(i, j, k, z, ih, jh){\ - asm("vextractf128 xmm"tostr(ih)", ymm"tostr(i)", 1");\ - asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ - asm("vpcmpgtb xmm"tostr(jh)", xmm"tostr(z)", xmm"tostr(ih)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(ih)", xmm"tostr(ih)", xmm"tostr(ih)"");\ - asm("vinsertf128 ymm"tostr(j)", ymm"tostr(j)", xmm"tostr(jh)", 1");\ - asm("vinsertf128 ymm"tostr(i)", ymm"tostr(i)", xmm"tostr(ih)", 1");\ - asm("vandpd ymm"tostr(j)", ymm"tostr(j)", ymm"tostr(k)"");\ - asm("vxorpd ymm"tostr(i)", ymm"tostr(i)", ymm"tostr(j)"");\ -}/**/ - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2v2(i, j, k, z){\ - asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2v3(i, j, k, z){\ - asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* Yet 
another implementation of MixBytes.
- This time we use the formulae (3) from the paper "Byte Slicing Groestl".
- Input: a0, ..., a7
- Output: b0, ..., b7 = MixBytes(a0,...,a7).
- but we use the relations:
- t_i = a_i + a_{i+1}
- x_i = t_i + t_{i+3}
- y_i = t_i + t_{i+2} + a_{i+6}
- z_i = 2*x_i
- w_i = z_i + y_{i+4}
- v_i = 2*w_i
- b_i = v_{i+3} + y_{i+4}
- We keep building b_i in registers ymm8..ymm15 by first building y_{i+4} there
- and then adding v_i computed in the meantime in registers ymm0..ymm7.
- We almost fit into 16 registers, needing only a few spills to memory.
- This implementation costs 7.7 c/b, giving a total speed on SNB of 10.7 c/b.
- K. Matusiewicz, 2011/05/29 */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
- /* ymm8..ymm15 = a2 a3 ... a0 a1 */\
- asm("vmovdqa ymm"tostr(b0)", ymm"tostr(a2)"");\
- asm("vmovdqa ymm"tostr(b1)", ymm"tostr(a3)"");\
- asm("vmovdqa ymm"tostr(b2)", ymm"tostr(a4)"");\
- asm("vmovdqa ymm"tostr(b3)", ymm"tostr(a5)"");\
- asm("vmovdqa ymm"tostr(b4)", ymm"tostr(a6)"");\
- asm("vmovdqa ymm"tostr(b5)", ymm"tostr(a7)"");\
- asm("vmovdqa ymm"tostr(b6)", ymm"tostr(a0)"");\
- asm("vmovdqa ymm"tostr(b7)", ymm"tostr(a1)"");\
- \
- /* t_i = a_i + a_{i+1} */\
- asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", ymm"tostr(a1)"");\
- asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", ymm"tostr(a2)"");\
- asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", ymm"tostr(a3)"");\
- asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", ymm"tostr(a4)"");\
- asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", ymm"tostr(a5)"");\
- asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(a6)"");\
- asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(a7)"");\
- asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", ymm"tostr(b6)"");\
- \
- /* build y4 y5 y6 ... 
in regs ymm8, ymm9, ymm10 by adding t_i*/\ - asm("vxorpd ymm"tostr(b0)", ymm"tostr(b0)", ymm"tostr(a4)"");\ - asm("vxorpd ymm"tostr(b1)", ymm"tostr(b1)", ymm"tostr(a5)"");\ - asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(a6)"");\ - asm("vxorpd ymm"tostr(b3)", ymm"tostr(b3)", ymm"tostr(a7)"");\ - asm("vxorpd ymm"tostr(b4)", ymm"tostr(b4)", ymm"tostr(a0)"");\ - asm("vxorpd ymm"tostr(b5)", ymm"tostr(b5)", ymm"tostr(a1)"");\ - asm("vxorpd ymm"tostr(b6)", ymm"tostr(b6)", ymm"tostr(a2)"");\ - asm("vxorpd ymm"tostr(b7)", ymm"tostr(b7)", ymm"tostr(a3)"");\ - \ - asm("vxorpd ymm"tostr(b0)", ymm"tostr(b0)", ymm"tostr(a6)"");\ - asm("vxorpd ymm"tostr(b1)", ymm"tostr(b1)", ymm"tostr(a7)"");\ - asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(a0)"");\ - asm("vxorpd ymm"tostr(b3)", ymm"tostr(b3)", ymm"tostr(a1)"");\ - asm("vxorpd ymm"tostr(b4)", ymm"tostr(b4)", ymm"tostr(a2)"");\ - asm("vxorpd ymm"tostr(b5)", ymm"tostr(b5)", ymm"tostr(a3)"");\ - asm("vxorpd ymm"tostr(b6)", ymm"tostr(b6)", ymm"tostr(a4)"");\ - asm("vxorpd ymm"tostr(b7)", ymm"tostr(b7)", ymm"tostr(a5)"");\ - \ - /* spill values y_4, y_5 to memory */\ - asm("vmovaps [TEMP+0*32], ymm"tostr(b0)"");\ - asm("vmovaps [TEMP+1*32], ymm"tostr(b1)"");\ - asm("vmovaps [TEMP+2*32], ymm"tostr(b2)"");\ - asm("vmovaps [TEMP+3*32], ymm"tostr(b3)"");\ - asm("vmovaps [TEMP+4*32], ymm"tostr(b4)"");\ - \ - /* save values t0, t1, t2 to ymm8, ymm9 and memory */\ - asm("vmovdqa ymm"tostr(b0)", ymm"tostr(a0)"");\ - asm("vmovdqa ymm"tostr(b1)", ymm"tostr(a1)"");\ - asm("vmovaps [TEMP+5*32], ymm"tostr(a2)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", ymm"tostr(a3)"");\ - asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", ymm"tostr(a4)"");\ - asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", ymm"tostr(a5)"");\ - asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", ymm"tostr(a6)"");\ - asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", ymm"tostr(a7)"");\ - asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(b0)"");\ - asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(b1)"");\ - asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", [TEMP+5*32]");\ - \ - /*compute z_i : double x_i using temp ymm8 and 1B ymm9 */\ - asm("vmovaps ymm"tostr(b1)", [ALL_1B]");\ - asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(b2)"");\ - VMUL2(a7, b0, b1, b2, b3, b4);\ - VMUL2(a6, b0, b1, b2, b3, b4);\ - VMUL2(a5, b0, b1, b2, b3, b4);\ - VMUL2(a4, b0, b1, b2, b3, b4);\ - VMUL2(a3, b0, b1, b2, b3, b4);\ - VMUL2(a2, b0, b1, b2, b3, b4);\ - VMUL2(a1, b0, b1, b2, b3, b4);\ - VMUL2(a0, b0, b1, b2, b3, b4);\ - \ - /* compute w_i : add y_{i+4} */\ - asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", [TEMP+0*32]");\ - asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", [TEMP+1*32]");\ - asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", [TEMP+2*32]");\ - asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", [TEMP+3*32]");\ - asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", [TEMP+4*32]");\ - asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(b5)"");\ - asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(b6)"");\ - asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", ymm"tostr(b7)"");\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0, b0, b1, b2, b3, b4);\ - VMUL2(a1, b0, b1, b2, b3, b4);\ - VMUL2(a2, b0, b1, b2, b3, b4);\ - VMUL2(a3, b0, b1, b2, b3, b4);\ - VMUL2(a4, b0, b1, b2, b3, b4);\ - VMUL2(a5, b0, b1, b2, b3, b4);\ - VMUL2(a6, b0, b1, b2, b3, b4);\ - VMUL2(a7, b0, b1, b2, b3, b4);\ - \ - /* add to y_4 y_5 .. v3, v4, ... 
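
Each VMUL2 above doubles 32 GF(2^8) elements; this AVX1 version has to split the ymm into xmm halves because 256-bit integer operations only arrived with AVX2. The underlying trick is the same as in VMUL2v2: pcmpgtb against zero spreads each byte's top bit into a full 0x00/0xFF mask, paddb shifts each byte left without cross-byte carries, and the mask gates the 0x1b reduction. The same idea in plain SSE2 intrinsics (gf2_mul2_x16 is an illustrative name):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>   /* SSE2 */

static __m128i gf2_mul2_x16(__m128i x)
{
    __m128i msb  = _mm_cmpgt_epi8(_mm_setzero_si128(), x); /* 0xFF where top bit set */
    __m128i dbl  = _mm_add_epi8(x, x);                     /* byte-wise x << 1 */
    __m128i poly = _mm_and_si128(msb, _mm_set1_epi8(0x1b));
    return _mm_xor_si128(dbl, poly);                       /* conditional reduce */
}

int main(void)
{
    uint8_t v[16], o[16];
    for (int i = 0; i < 16; i++) v[i] = (uint8_t)(i * 17);
    __m128i r = gf2_mul2_x16(_mm_loadu_si128((const __m128i *)v));
    _mm_storeu_si128((__m128i *)o, r);
    for (int i = 0; i < 16; i++)
        printf("%02x -> %02x\n", v[i], o[i]);
    return 0;
}
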
*/\ - asm("vxorpd ymm"tostr(b0)", ymm"tostr(a3)", [TEMP+0*32]");\ - asm("vxorpd ymm"tostr(b1)", ymm"tostr(a4)", [TEMP+1*32]");\ - asm("vxorpd ymm"tostr(b2)", ymm"tostr(a5)", [TEMP+2*32]");\ - asm("vxorpd ymm"tostr(b3)", ymm"tostr(a6)", [TEMP+3*32]");\ - asm("vxorpd ymm"tostr(b4)", ymm"tostr(a7)", [TEMP+4*32]");\ - asm("vxorpd ymm"tostr(b5)", ymm"tostr(a0)", ymm"tostr(b5)"");\ - asm("vxorpd ymm"tostr(b6)", ymm"tostr(a1)", ymm"tostr(b6)"");\ - asm("vxorpd ymm"tostr(b7)", ymm"tostr(a2)", ymm"tostr(b7)"");\ -}/*MixBytes*/ - -/* AVX SubShift - * inputs: - * * i - * * c0 (must be 0) - * * ShiftP - * * ShiftQ - * output i = S[Shift(i_1, ShiftQ)|Shift(i_0, ShiftP)] - * clobbers: t0 - * */ -#define SubShift(i, t0, c0, ShiftP, ShiftQ){\ - asm("vextractf128 xmm"tostr(t0)", ymm"tostr(i)", 1");\ - asm("vpshufb xmm"tostr(i)", xmm"tostr(i)", [SUBSH_MASK+"tostr(ShiftP)"*16]");\ - asm("vpshufb xmm"tostr(t0)", xmm"tostr(t0)", [SUBSH_MASK+"tostr(ShiftQ)"*16]");\ - asm("vaesenclast xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(c0)"");\ - asm("vaesenclast xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm("vinsertf128 ymm"tostr(i)", ymm"tostr(i)", xmm"tostr(t0)", 1");\ -}/**/ - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBSHIFTMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* ShiftBytes + SubBytes */\ - asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ - SubShift(a0, b1, b0, 0, 1);\ - SubShift(a1, b1, b0, 1, 3);\ - SubShift(a2, b1, b0, 2, 5);\ - SubShift(a3, b1, b0, 3, 7);\ - SubShift(a4, b1, b0, 4, 0);\ - SubShift(a5, b1, b0, 5, 2);\ - SubShift(a6, b1, b0, 6, 4);\ - SubShift(a7, b1, b0, 7, 6);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -#define ROUNDS_P_Q(){\ - asm ("xor rax, rax");\ - asm ("1:");\ - /* AddRoundConstant */\ - asm ("vxorpd ymm6, ymm6, ymm6");\ - asm ("vinsertf128 ymm7, ymm6, [ROUND_CONST_Q+eax*8], 1");\ - asm ("vinsertf128 ymm6, ymm6, [ALL_FF], 1");\ - asm ("vinsertf128 ymm0, ymm6, [ROUND_CONST_P+eax*8], 0");\ - asm ("vxorpd ymm0, ymm8, ymm0");\ - asm ("vxorpd ymm1, ymm9, ymm6");\ - asm ("vxorpd ymm2, ymm10, ymm6");\ - asm ("vxorpd ymm3, ymm11, ymm6");\ - asm ("vxorpd ymm4, ymm12, ymm6");\ - asm ("vxorpd ymm5, ymm13, ymm6");\ - asm ("vxorpd ymm6, ymm14, ymm6");\ - asm ("vxorpd ymm7, ymm15, ymm7");\ - /* SubBytes + ShiftBytes + MixBytes */\ - SUBSHIFTMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - asm ("add al, 2");\ - asm ("mov rbx, rax");\ - asm ("sub bl, 28");\ - asm ("jb 1b");\ -} - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("vpshufb xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i5)", xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i7)", xmm"tostr(i7)", xmm"tostr(t0)"");\ -\ - /* continue with unpack */\ - asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm 
("vpunpckhwd xmm"tostr(t1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpckhwd xmm"tostr(t2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpckhwd xmm"tostr(t3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklwd xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklwd xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ -\ - /* shuffle with immediate */\ - asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - asm ("vpshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("vpshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ - asm ("vpshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("vpshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("vpshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ -\ - /* continue with unpack */\ - asm ("vpunpckhdq xmm"tostr(t4)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckhdq xmm"tostr(t5)", xmm"tostr(t0)", xmm"tostr(t1)"");\ - asm ("vpunpckldq xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t1)"");\ - asm ("vpunpckhdq xmm"tostr(t6)", xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("vpunpckldq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("vpunpckhdq xmm"tostr(t7)", xmm"tostr(t2)", xmm"tostr(t3)"");\ - asm ("vpunpckldq xmm"tostr(t2)", xmm"tostr(t2)", xmm"tostr(t3)"");\ -\ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(t0)", xmm"tostr(t2)"");\ - asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(t0)", xmm"tostr(t2)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(t4)", xmm"tostr(t6)"");\ - asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(t4)", xmm"tostr(t6)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(t5)", xmm"tostr(t7)"");\ - asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(t5)", xmm"tostr(t7)"");\ - /* transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - asm ("vmovaps xmm"tostr(o0)", [TRANSP_MASK]");\ - /* transpose matrix to get output format */\ - asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpckhqdq xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpckhqdq xmm"tostr(t1)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpckhqdq xmm"tostr(t2)", xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ - /* load transpose mask into a register, because it will be used 8 times */\ - asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(o0)"");\ - asm ("vpshufb 
xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(t1)", xmm"tostr(t1)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(t2)", xmm"tostr(t2)", xmm"tostr(o0)"");\ - /* continue with unpack */\ - asm ("vpunpckhwd xmm"tostr(t3)", xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("vpunpcklwd xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("vpunpckhwd xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckhwd xmm"tostr(o2)", xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("vpunpcklwd xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("vpunpckhwd xmm"tostr(t4)", xmm"tostr(t1)", xmm"tostr(t2)"");\ - asm ("vpunpcklwd xmm"tostr(t1)", xmm"tostr(t1)", xmm"tostr(t2)"");\ - /* shuffle with immediate */\ - asm ("vpshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("vpshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("vpshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ - asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("vpshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ - asm ("vpshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("vpshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ - /* continue with unpack */\ - asm ("vpunpckhdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpckhdq xmm"tostr(i3)", xmm"tostr(o0)", xmm"tostr(t3)"");\ - asm ("vpunpckldq xmm"tostr(o0)", xmm"tostr(o0)", xmm"tostr(t3)"");\ - asm ("vpunpckhdq xmm"tostr(i5)", xmm"tostr(o1)", xmm"tostr(t1)"");\ - asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t1)"");\ - asm ("vpunpckhdq xmm"tostr(i7)", xmm"tostr(o2)", xmm"tostr(t4)"");\ - asm ("vpunpckldq xmm"tostr(o2)", xmm"tostr(o2)", xmm"tostr(t4)"");\ - /* transpose done */\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm8 - xmm15 */ - asm ("vmovaps xmm8, [rdi+0*16]"); - asm ("vmovaps xmm9, [rdi+1*16]"); - asm ("vmovaps xmm10, [rdi+2*16]"); - asm ("vmovaps xmm11, [rdi+3*16]"); - asm ("vmovaps xmm12, [rdi+4*16]"); - asm ("vmovaps xmm13, [rdi+5*16]"); - asm ("vmovaps xmm14, [rdi+6*16]"); - asm ("vmovaps xmm15, [rdi+7*16]"); - - /* transform chaining value from column ordering into row ordering */ - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store transposed IV */ - asm ("vmovaps [rdi+0*16], xmm8"); - asm ("vmovaps [rdi+1*16], xmm9"); - asm ("vmovaps [rdi+2*16], xmm10"); - asm ("vmovaps [rdi+3*16], xmm11"); - asm ("vmovaps [rdi+4*16], xmm12"); - asm ("vmovaps [rdi+5*16], xmm13"); - asm ("vmovaps [rdi+6*16], xmm14"); - asm ("vmovaps [rdi+7*16], xmm15"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF1024(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm8...xmm15 (Q = message) */ - asm ("vmovaps xmm0, [rsi+0*16]"); - asm ("vmovaps xmm1, [rsi+1*16]"); - asm ("vmovaps xmm2, [rsi+2*16]"); - asm ("vmovaps xmm3, [rsi+3*16]"); - asm ("vmovaps xmm4, [rsi+4*16]"); - asm ("vmovaps xmm5, [rsi+5*16]"); - asm 
("vmovaps xmm6, [rsi+6*16]"); - asm ("vmovaps xmm7, [rsi+7*16]"); - - /* transform message M from column ordering into row ordering */ - Matrix_Transpose(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8...xmm15 */ - asm ("vpxor xmm8, xmm0, [rdi+0*16]"); - asm ("vpxor xmm9, xmm1, [rdi+1*16]"); - asm ("vpxor xmm10, xmm2, [rdi+2*16]"); - asm ("vpxor xmm11, xmm3, [rdi+3*16]"); - asm ("vpxor xmm12, xmm4, [rdi+4*16]"); - asm ("vpxor xmm13, xmm5, [rdi+5*16]"); - asm ("vpxor xmm14, xmm6, [rdi+6*16]"); - asm ("vpxor xmm15, xmm7, [rdi+7*16]"); - - /* generate AVX registers with Q in high and P in low 128 bits */ - asm ("vinsertf128 ymm8, ymm8, xmm0, 1"); - asm ("vinsertf128 ymm9, ymm9, xmm1, 1"); - asm ("vinsertf128 ymm10, ymm10, xmm2, 1"); - asm ("vinsertf128 ymm11, ymm11, xmm3, 1"); - asm ("vinsertf128 ymm12, ymm12, xmm4, 1"); - asm ("vinsertf128 ymm13, ymm13, xmm5, 1"); - asm ("vinsertf128 ymm14, ymm14, xmm6, 1"); - asm ("vinsertf128 ymm15, ymm15, xmm7, 1"); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* extract output of Q to xmm0...xmm7 */ - asm ("vextractf128 xmm0, ymm8, 1"); - asm ("vextractf128 xmm1, ymm9, 1"); - asm ("vextractf128 xmm2, ymm10, 1"); - asm ("vextractf128 xmm3, ymm11, 1"); - asm ("vextractf128 xmm4, ymm12, 1"); - asm ("vextractf128 xmm5, ymm13, 1"); - asm ("vextractf128 xmm6, ymm14, 1"); - asm ("vextractf128 xmm7, ymm15, 1"); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm8...xmm15 */ - asm ("vpxor xmm8, xmm8, xmm0"); - asm ("vpxor xmm9, xmm9, xmm1"); - asm ("vpxor xmm10, xmm10, xmm2"); - asm ("vpxor xmm11, xmm11, xmm3"); - asm ("vpxor xmm12, xmm12, xmm4"); - asm ("vpxor xmm13, xmm13, xmm5"); - asm ("vpxor xmm14, xmm14, xmm6"); - asm ("vpxor xmm15, xmm15, xmm7"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm8...xmm15 */ - asm ("vpxor xmm8, xmm8, [rdi+0*16]"); - asm ("vpxor xmm9, xmm9, [rdi+1*16]"); - asm ("vpxor xmm10, xmm10, [rdi+2*16]"); - asm ("vpxor xmm11, xmm11, [rdi+3*16]"); - asm ("vpxor xmm12, xmm12, [rdi+4*16]"); - asm ("vpxor xmm13, xmm13, [rdi+5*16]"); - asm ("vpxor xmm14, xmm14, [rdi+6*16]"); - asm ("vpxor xmm15, xmm15, [rdi+7*16]"); - - /* store CV */ - asm ("vmovaps [rdi+0*16], xmm8"); - asm ("vmovaps [rdi+1*16], xmm9"); - asm ("vmovaps [rdi+2*16], xmm10"); - asm ("vmovaps [rdi+3*16], xmm11"); - asm ("vmovaps [rdi+4*16], xmm12"); - asm ("vmovaps [rdi+5*16], xmm13"); - asm ("vmovaps [rdi+6*16], xmm14"); - asm ("vmovaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF1024(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - asm ("vpxor xmm0, xmm0, xmm0"); - - /* load CV into registers xmm8...xmm15 */ - asm ("vmovaps xmm8, [rdi+0*16]"); - asm ("vmovaps xmm9, [rdi+1*16]"); - asm ("vmovaps xmm10, [rdi+2*16]"); - asm ("vmovaps xmm11, [rdi+3*16]"); - asm ("vmovaps xmm12, [rdi+4*16]"); - asm ("vmovaps xmm13, [rdi+5*16]"); - asm ("vmovaps xmm14, [rdi+6*16]"); - asm ("vmovaps xmm15, [rdi+7*16]"); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8...xmm15 */ - ROUNDS_P_Q(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - asm ("vpxor xmm8, xmm8, [rdi+0*16]"); - asm ("vpxor xmm9, xmm9, 
[rdi+1*16]"); - asm ("vpxor xmm10, xmm10, [rdi+2*16]"); - asm ("vpxor xmm11, xmm11, [rdi+3*16]"); - asm ("vpxor xmm12, xmm12, [rdi+4*16]"); - asm ("vpxor xmm13, xmm13, [rdi+5*16]"); - asm ("vpxor xmm14, xmm14, [rdi+6*16]"); - asm ("vpxor xmm15, xmm15, [rdi+7*16]"); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); - - /* we only need to return the truncated half of the state */ - asm ("vmovaps [rdi+4*16], xmm0"); - asm ("vmovaps [rdi+5*16], xmm6"); - asm ("vmovaps [rdi+6*16], xmm13"); - asm ("vmovaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - diff --git a/algo/groestl/aes_ni/groestl-asm-vperm.h b/algo/groestl/aes_ni/groestl-asm-vperm.h deleted file mode 100644 index f8ae27c..0000000 --- a/algo/groestl/aes_ni/groestl-asm-vperm.h +++ /dev/null @@ -1,1397 +0,0 @@ -/* groestl-asm-vperm.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3 instructions. - * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz - * - * Based on the vperm and aes_ni implementations of the hash function Groestl - * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ - * Institute of Applied Mathematics, Middle East Technical University, Turkey - * - * This code is placed in the public domain - */ - -#include "hash-groestl.h" - -/* global constants */ -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (16))) unsigned char ALL_0F[16]; -__attribute__ ((aligned (16))) unsigned char ALL_15[16]; -__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; -__attribute__ ((aligned (16))) unsigned char ALL_63[16]; -__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; -__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16]; - -/* temporary variables */ -__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16]; -__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP[8*16]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_SHARED_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\ - ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\ - ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\ - ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\ - ((u64*)VPERM_IPT)[ 0] = 
0x4C01307D317C4D00ULL;\ - ((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\ - ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\ - ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\ - ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\ - ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\ - ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\ - ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\ - ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\ - ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\ - ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\ - ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\ - ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\ - ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\ - ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\ - ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\ - ((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\ - ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\ - ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\ - ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\ - ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\ - ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\ - ((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\ - ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\ -/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\ - ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\ - ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\ - ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\ - ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\ - ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\ -}/**/ - -/* VPERM - * Transform w/o settings c* - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\ - asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ - asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\ - asm ("psrld xmm"tostr(t0)", 4");\ - asm ("psrld xmm"tostr(t1)", 4");\ - asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ - asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\ - asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\ - asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\ - asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\ - asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\ -}/**/ - -#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ - asm ("movaps xmm"tostr(c0)", [ALL_0F]");\ - asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\ - asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\ -}/**/ - -/* VPERM - * Transform - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * 
Transform State - * inputs: - * a0-a3 = state - * table = transformation table to use - * t* = clobbers - * outputs: - * a0-a3 = transformed state - * */ -#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ - VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Add Constant to State - * inputs: - * a0-a7 = state - * constant = constant to add - * t0 = clobber - * outputs: - * a0-a7 = state + constant - * */ -#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ - asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * Set Substitute Core Constants - * */ -#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ - VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ -}/**/ - -/* VPERM - * Substitute Core - * first part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0 = 1 row - * t*, c* = clobbers - * outputs: - * b0a, b0b = inputs for lookup step - * */ -#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ - asm ("psrld xmm"tostr(t0)", 4");\ - asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\ - asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\ - asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\ - asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\ - asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\ - asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * Lookup - * second part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0a, a0b = output of Substitution Core - * table = lookup table to use (*1 / *2 / *4) - * t0 = clobber - * outputs: - * b0 = output of sbox + multiplication - * */ -#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ - asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\ - asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\ - asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * SubBytes and *2 / *4 - * this function is derived from: - * Constant-time SSSE3 AES core implementation - * by Mike Hamburg - * and - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0-a7 = state - * t*, c* = clobbers - * 
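
VPERM_Lookup above is the classic two-table pshufb evaluation: any byte map f that is linear over GF(2) satisfies f(x) = f(x & 0x0F) ^ f(x & 0xF0), so two 16-entry tables, one indexed by each nibble, reproduce it exactly. A sketch of the pattern, using GF(2^8) multiplication by 2 as the linear map (assumes SSSE3, -mssse3; mul2 and the tables are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>

static uint8_t mul2(uint8_t x) { return (uint8_t)((x << 1) ^ ((x & 0x80) ? 0x1b : 0)); }

int main(void)
{
    uint8_t lo[16], hi[16], in[16], out[16];
    for (int n = 0; n < 16; n++) {
        lo[n] = mul2((uint8_t)n);          /* f on the low nibble  */
        hi[n] = mul2((uint8_t)(n << 4));   /* f on the high nibble */
        in[n] = (uint8_t)(n * 31);
    }
    __m128i x  = _mm_loadu_si128((const __m128i *)in);
    __m128i xl = _mm_and_si128(x, _mm_set1_epi8(0x0f));
    __m128i xh = _mm_and_si128(_mm_srli_epi16(x, 4), _mm_set1_epi8(0x0f));
    __m128i r  = _mm_xor_si128(
        _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)lo), xl),
        _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)hi), xh));
    _mm_storeu_si128((__m128i *)out, r);
    for (int n = 0; n < 16; n++)
        printf("%02x -> %02x (expect %02x)\n", in[n], out[n], mul2(in[n]));
    return 0;
}
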
outputs: - * a0-a7 = state * 4 - * c2 = row0 * 2 -> b0 - * c1 = row7 * 2 -> b3 - * c0 = row7 * 1 -> b4 - * t2 = row4 * 1 -> b7 - * TEMP_MUL1 = row(i) * 1 - * TEMP_MUL2 = row(i) * 2 - * - * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ -#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ - /* set Constants */\ - VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ - /* row 1 */\ - VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ - /* --- */\ - /* row 2 */\ - VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ - /* --- */\ - /* row 3 */\ - VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ - /* --- */\ - /* row 5 */\ - VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ - /* --- */\ - /* row 6 */\ - VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ - /* --- */\ - /* row 7 */\ - VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ - VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ - /* --- */\ - /* row 4 */\ - VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ - /* --- */\ - /* row 0 */\ - VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ - asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ - /* --- */\ -}/**/ - - -/* Optimized MixBytes - * inputs: - * a0-a7 = (row0-row7) * 4 - * b0 = row0 * 2 - * b3 = row7 * 2 - * b4 = row7 * 1 - * b7 = row4 * 1 - * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 - * output: b0-b7 - * */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* save one value */\ - asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\ - /* 1 */\ - asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! 
*/\ - asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\ - asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\ - \ - /* 2 */\ - asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\ - asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\ - \ - /* 4 */\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\ - /*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\ - asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\ - \ - /* 3 */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\ - /*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\ - asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\ - \ - /* 5 */\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ - /*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\ - \ - /* 6 */\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ - \ - /* 7 */\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ - \ - /* 8 */\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ - \ - /* 9 */\ - asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* 10 */\ - asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\ - \ - /* 11 */\ - asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ - \ - /* 12 */\ - asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* 13 */\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ - asm 
("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ -}/**/ - -#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}/**/ - -#define Push_All_Regs(){\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}/**/ - -#define Pop_All_Regs(){\ -/* not using any... - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}/**/ - - -/* vperm: - * transformation before rounds with ipt - * first round add transformed constant - * middle rounds: add constant XOR 0x15...15 - * last round: additionally add 0x15...15 after MB - * transformation after rounds with opt - */ -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant + ShiftBytes (interleaved) */\ - asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - /* SubBytes + Multiplication by 2 and 4 */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - 
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ -} - - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ -\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ -\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ -\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ -\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ -\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ - asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm 
("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\ - asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\ - asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\ - VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("pxor xmm1, [ALL_15]");\ - asm ("pxor xmm2, [ALL_15]");\ - asm ("pxor xmm3, [ALL_15]");\ - asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\ - asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\ - asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\ - asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - asm ("movaps xmm0, [ROUND_CONST_Lx]");\ - VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("movaps [ROUND_CONST_Lx], xmm0");\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ -}/**/ - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* transform round constants into VPERM mode */ - 
VPERM_Transform_RoundConst(); - - /* load IV into registers xmm12 - xmm15 */ - asm ("movaps xmm12, [rdi+0*16]"); - asm ("movaps xmm13, [rdi+1*16]"); - asm ("movaps xmm14, [rdi+2*16]"); - asm ("movaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm12"); - asm ("movaps [rdi+1*16], xmm2"); - asm ("movaps [rdi+2*16], xmm6"); - asm ("movaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("movaps xmm12, [rsi+0*16]"); - asm ("movaps xmm13, [rsi+1*16]"); - asm ("movaps xmm14, [rsi+2*16]"); - asm ("movaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm0, [rdi+1*16]"); - asm ("movaps xmm4, [rdi+2*16]"); - asm ("movaps xmm5, [rdi+3*16]"); - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("pxor xmm8, xmm12"); - asm ("pxor xmm0, xmm2"); - asm ("pxor xmm4, xmm6"); - asm ("pxor xmm5, xmm7"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("pxor xmm0, xmm8"); - asm ("pxor xmm1, xmm10"); - asm ("pxor xmm2, xmm12"); - asm ("pxor xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("pxor xmm0, [rdi+0*16]"); - asm ("pxor xmm1, [rdi+1*16]"); - asm ("pxor xmm2, [rdi+2*16]"); - asm ("pxor xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm0"); - asm ("movaps [rdi+1*16], xmm1"); - asm ("movaps [rdi+2*16], xmm2"); - asm ("movaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm10, [rdi+1*16]"); - asm ("movaps xmm12, [rdi+2*16]"); - asm ("movaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register 
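
TF512 above (here in the vperm assembly being removed, but the structure is shared by every variant) is the Groestl-512 compression function: per its own comments it computes P(CV+M)+Q(M)+CV, with both permutations evaluated in parallel by ROUNDS_P_Q. A minimal scalar sketch of that structure; groestl_p512 and groestl_q512 are assumed stand-ins for the vectorized permutations, not functions in this codebase:

    #include <stdint.h>

    /* Assumed scalar P and Q permutations of the 512-bit state. */
    extern void groestl_p512( uint64_t out[8], const uint64_t in[8] );
    extern void groestl_q512( uint64_t out[8], const uint64_t in[8] );

    /* Sketch of the compression computed by TF512:
     * CV' = P(CV ^ M) ^ Q(M) ^ CV  (the final XOR is the feed-forward). */
    static void tf512_sketch( uint64_t cv[8], const uint64_t m[8] )
    {
        uint64_t p_in[8], p_out[8], q_out[8];
        for ( int i = 0; i < 8; i++ ) p_in[i] = cv[i] ^ m[i];
        groestl_p512( p_out, p_in );
        groestl_q512( q_out, m );
        for ( int i = 0; i < 8; i++ )
            cv[i] ^= p_out[i] ^ q_out[i];
    }
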
*/ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm10, [rdi+1*16]"); - asm ("pxor xmm12, [rdi+2*16]"); - asm ("pxor xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+2*16], xmm9"); - asm ("movaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - -#if (LENGTH > 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ - ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x0f0e0d0c0b0a0908ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0807060504030201ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x000f0e0d0c0b0a09ULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0908070605040302ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x01000f0e0d0c0b0aULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a09080706050403ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0201000f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0a090807060504ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x030201000f0e0d0cULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0c0b0a0908070605ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x04030201000f0e0dULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0d0c0b0a09080706ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x0504030201000f0eULL;\ - ((u64*)SUBSH_MASK)[14] = 0x0201000f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0a09080706050403ULL;\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ - ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ - ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - }\ -}/**/ - -#define Push_All_Regs(){\ - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");\ -}/**/ - -#define Pop_All_Regs(){\ - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");\ -}/**/ - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* SubBytes + Multiplication */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -#define ROUNDS_P(){\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("1:");\ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm8, [ROUND_CONST_P+eax*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm9, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm10, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm11, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm12, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm13, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm14, [SUBSH_MASK+6*16]");\ - asm 
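
OF512 above is the output transformation: it runs P alone over the chaining value, feeds the CV forward, and keeps only the half of the state that it stores back (the last 256 bits). The same logic as a scalar sketch, with the same assumed groestl_p512 stand-in as before:

    #include <stdint.h>

    extern void groestl_p512( uint64_t out[8], const uint64_t in[8] );

    /* Output transform sketch: hash = trunc_256( P(CV) ^ CV ). */
    static void of512_sketch( uint64_t hash[4], const uint64_t cv[8] )
    {
        uint64_t p_out[8];
        groestl_p512( p_out, cv );
        for ( int i = 0; i < 4; i++ )
            hash[i] = p_out[i+4] ^ cv[i+4];   /* last 256 bits */
    }
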
("pshufb xmm15, [SUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - VPERM_Add_Constant(0, 1, 2, 3, 4, 5, 6, 7, ALL_15, 8);\ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm0, [ROUND_CONST_P+ebx*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm1, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm2, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm3, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm4, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm5, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm6, [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm7, [SUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ - asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 1b");\ -}/**/ - -#define ROUNDS_Q(){\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 1);\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("2:");\ - /* AddRoundConstant Q1024 */\ - asm ("movaps xmm1, [ALL_FF]");\ - asm ("pxor xmm8, xmm1");\ - asm ("pxor xmm9, xmm1");\ - asm ("pxor xmm10, xmm1");\ - asm ("pxor xmm11, xmm1");\ - asm ("pxor xmm12, xmm1");\ - asm ("pxor xmm13, xmm1");\ - asm ("pxor xmm14, xmm1");\ - asm ("pxor xmm15, [ROUND_CONST_Q+eax*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm9, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm10, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm11, [SUBSH_MASK+7*16]");\ - asm ("pshufb xmm12, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm13, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm14, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm15, [SUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - /* AddRoundConstant Q1024 */\ - asm ("movaps xmm9, [ALL_FF]");\ - asm ("pxor xmm0, xmm9");\ - asm ("pxor xmm1, xmm9");\ - asm ("pxor xmm2, xmm9");\ - asm ("pxor xmm3, xmm9");\ - asm ("pxor xmm4, xmm9");\ - asm ("pxor xmm5, xmm9");\ - asm ("pxor xmm6, xmm9");\ - asm ("pxor xmm7, [ROUND_CONST_Q+ebx*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm1, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm2, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm3, [SUBSH_MASK+7*16]");\ - asm ("pshufb xmm4, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm5, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm6, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm7, [SUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 2b");\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 1);\ -}/**/ - - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i2)"");\ - asm ("pshufb xmm"tostr(i4)", 
xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("pshufb xmm"tostr(i7)", xmm"tostr(t0)"");\ -\ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i0)"");\ - asm ("punpckhwd xmm"tostr(t2)", xmm"tostr(i5)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i7)"");\ - asm ("punpcklwd xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(t1)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ -\ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ -\ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(i0)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(t4)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(t5)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t0)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(t5)", xmm"tostr(t1)"");\ - asm ("movdqa xmm"tostr(t6)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(t7)", xmm"tostr(t2)"");\ - asm ("punpckhdq xmm"tostr(t6)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t2)", xmm"tostr(t3)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(t7)", xmm"tostr(t3)"");\ -\ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i4)", xmm"tostr(t4)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(t4)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i6)", xmm"tostr(t5)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t6)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(t5)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t6)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t7)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t7)"");\ - /* transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - /* transpose matrix to get output format */\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(t0)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq 
xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(t1)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i6)"");\ - asm ("movaps xmm"tostr(o0)", [TRANSP_MASK]");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(t2)", xmm"tostr(i7)"");\ - /* load transpose mask into a register, because it will be used 8 times */\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i4)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(o1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(o0)"");\ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(t1)"");\ - \ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("punpckhwd xmm"tostr(o0)", xmm"tostr(i2)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhwd xmm"tostr(o2)", xmm"tostr(t0)"");\ - asm ("punpcklwd xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhwd xmm"tostr(t4)", xmm"tostr(t2)"");\ - asm ("punpcklwd xmm"tostr(t1)", xmm"tostr(t2)"");\ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(o0)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(o2)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(o0)", xmm"tostr(t3)"");\ - asm ("punpckhdq xmm"tostr(i3)", xmm"tostr(t3)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(i5)", xmm"tostr(t1)"");\ - asm ("punpckldq xmm"tostr(o2)", xmm"tostr(t4)"");\ - asm ("punpckhdq xmm"tostr(i7)", xmm"tostr(t4)"");\ - /* transpose done */\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - asm ("movaps xmm0, [ROUND_CONST_P+"tostr(i)"*16]");\ - asm ("movaps xmm1, [ROUND_CONST_P+"tostr(j)"*16]");\ - asm ("movaps xmm2, [ROUND_CONST_Q+"tostr(i)"*16]");\ - asm ("movaps xmm3, [ROUND_CONST_Q+"tostr(j)"*16]");\ - VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm2, [ALL_15]");\ - asm ("pxor xmm3, [ALL_15]");\ - asm ("movaps [ROUND_CONST_P+"tostr(i)"*16], xmm0");\ - asm ("movaps [ROUND_CONST_P+"tostr(j)"*16], xmm1");\ - asm ("movaps [ROUND_CONST_Q+"tostr(i)"*16], xmm2");\ - asm ("movaps [ROUND_CONST_Q+"tostr(j)"*16], xmm3");\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - 
VPERM_Transform_RoundConst_CNT2(8, 9);\ - VPERM_Transform_RoundConst_CNT2(10, 11);\ - VPERM_Transform_RoundConst_CNT2(12, 13);\ - asm ("movaps xmm0, [ALL_FF]");\ - VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("movaps [ALL_FF], xmm0");\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* transform chaining value from column ordering into row ordering */ - VPERM_Transform_State( 8, 9, 10, 11, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF1024(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm8 - xmm15 (Q = message) */ - asm ("movaps xmm8, [rsi+0*16]"); - asm ("movaps xmm9, [rsi+1*16]"); - asm ("movaps xmm10, [rsi+2*16]"); - asm ("movaps xmm11, [rsi+3*16]"); - asm ("movaps xmm12, [rsi+4*16]"); - asm ("movaps xmm13, [rsi+5*16]"); - asm ("movaps xmm14, [rsi+6*16]"); - asm ("movaps xmm15, [rsi+7*16]"); - - /* transform message M from column ordering into row ordering */ - VPERM_Transform_State( 8, 9, 10, 11, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store message M (Q input) for later */ - asm ("movaps [QTEMP+0*16], xmm8"); - asm ("movaps [QTEMP+1*16], xmm9"); - asm ("movaps [QTEMP+2*16], xmm10"); - asm ("movaps [QTEMP+3*16], xmm11"); - asm ("movaps [QTEMP+4*16], xmm12"); - asm ("movaps [QTEMP+5*16], xmm13"); - asm ("movaps [QTEMP+6*16], xmm14"); - asm ("movaps [QTEMP+7*16], xmm15"); - - /* xor CV to message to get P input */ - /* result: CV+M in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV+M) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV+M)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* 
store P(CV+M)+CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - /* load message M (Q input) into xmm8-15 */ - asm ("movaps xmm8, [QTEMP+0*16]"); - asm ("movaps xmm9, [QTEMP+1*16]"); - asm ("movaps xmm10, [QTEMP+2*16]"); - asm ("movaps xmm11, [QTEMP+3*16]"); - asm ("movaps xmm12, [QTEMP+4*16]"); - asm ("movaps xmm13, [QTEMP+5*16]"); - asm ("movaps xmm14, [QTEMP+6*16]"); - asm ("movaps xmm15, [QTEMP+7*16]"); - - /* compute permutation Q */ - /* result: Q(M) in xmm8...xmm15 */ - ROUNDS_Q(); - - /* xor Q output */ - /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF1024(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); - VPERM_Transform_State( 0, 6, 13, 15, VPERM_OPT, 1, 2, 3, 5, 7, 10, 12); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+4*16], xmm0"); - asm ("movaps [rdi+5*16], xmm6"); - asm ("movaps [rdi+6*16], xmm13"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index 10092da..3c3e740 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -11,17 +11,6 @@ #include #include "hash-groestl.h" -/* global constants */ -__m128i ROUND_CONST_Lx; -//__m128i ROUND_CONST_L0[ROUNDS512]; -//__m128i ROUND_CONST_L7[ROUNDS512]; -__m128i ROUND_CONST_P[ROUNDS1024]; -__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_1B; -__m128i ALL_FF; - #define tos(a) #a #define 
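
The hunk above removes the file-scope __m128i definitions (ROUND_CONST_*, TRANSP_MASK, SUBSH_MASK, ALL_1B, ALL_FF) from the header. Because the header is included from more than one translation unit, every inclusion emitted another tentative definition of the same symbols; GCC 10 defaults to -fno-common, which turns those duplicates into multiple-definition link errors. The patch sidesteps the issue by baking the values into static const tables and inline m128_const_64() constants instead. The general pattern, sketched with a hypothetical symbol name:

    #include <immintrin.h>
    #include <stdint.h>

    /* In a shared header, a plain definition such as
     *     __m128i round_const_x;        // one copy per includer,
     *                                   // fails to link with -fno-common
     * must become a declaration ... */
    extern __m128i round_const_x;

    /* ... with the single definition living in exactly one .c file,
     * or (as this patch does) become a file-local constant: */
    static const uint64_t round_const_x_tab[2] = { 0x0, 0x0 };
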
tostr(a) tos(a) @@ -111,7 +100,7 @@ __m128i ALL_FF; \ /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ /* compute w_i : add y_{i+4} */\ - b1 = ALL_1B;\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ MUL2(a0, b0, b1);\ a0 = _mm_xor_si128(a0, TEMP0);\ MUL2(a1, b0, b1);\ @@ -152,24 +141,41 @@ __m128i ALL_FF; }/*MixBytes*/ -#define SET_CONSTANTS(){\ - ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ - SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ - SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ - SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ - }\ -}while(0);\ +static const uint64_t round_const_p[] __attribute__ ((aligned (64))) = +{ + 0x7060504030201000, 0xf0e0d0c0b0a09080, + 0x7161514131211101, 0xf1e1d1c1b1a19181, + 0x7262524232221202, 0xf2e2d2c2b2a29282, + 0x7363534333231303, 0xf3e3d3c3b3a39383, + 0x7464544434241404, 0xf4e4d4c4b4a49484, + 0x7565554535251505, 0xf5e5d5c5b5a59585, + 0x7666564636261606, 0xf6e6d6c6b6a69686, + 0x7767574737271707, 0xf7e7d7c7b7a79787, + 0x7868584838281808, 0xf8e8d8c8b8a89888, + 0x7969594939291909, 0xf9e9d9c9b9a99989, + 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a, + 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b, + 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c, + 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d +}; + +static const uint64_t round_const_q[] __attribute__ ((aligned (64))) = +{ + 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f, + 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e, + 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d, + 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c, + 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b, + 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a, + 0x8999a9b9c9d9e9f9, 0x0919293949596979, + 0x8898a8b8c8d8e8f8, 0x0818283848586878, + 0x8797a7b7c7d7e7f7, 0x0717273747576777, + 0x8696a6b6c6d6e6f6, 0x0616263646566676, + 0x8595a5b5c5d5e5f5, 0x0515253545556575, + 0x8494a4b4c4d4e4f4, 0x0414243444546474, + 0x8393a3b3c3d3e3f3, 0x0313233343536373, + 0x8292a2b2c2d2e2f2, 0x0212223242526272 +}; /* one round * a0-a7 = input rows @@ -194,30 +200,50 @@ __m128i ALL_FF; u8 round_counter = 0;\ for(round_counter = 0; round_counter < 14; round_counter+=2) {\ /* AddRoundConstant P1024 */\ - xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ + xmm8 = _mm_xor_si128( xmm8, \ + casti_m128i( round_const_p, round_counter ) ); \ /* ShiftBytes P1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ - xmm12 = 
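
MixBytes, used above with the 0x1b1b... constant fed to MUL2, multiplies each column of the state by the Groestl specification's circulant matrix circ(02,02,03,04,05,03,05,07) over GF(2^8) with reduction polynomial 0x11b; the byte-sliced formulation in these files only rearranges that arithmetic for SIMD. A scalar reference against which the vector code can be checked; xtime, gf_mul and mixbytes_column are illustrative helpers:

    #include <stdint.h>

    /* Double one byte in GF(2^8) mod x^8 + x^4 + x^3 + x + 1. */
    static uint8_t xtime( uint8_t x )
    {
        return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
    }

    /* Multiply by a small constant via shift-and-add. */
    static uint8_t gf_mul( uint8_t x, uint8_t c )
    {
        uint8_t r = 0;
        for ( ; c; c >>= 1 )
        {
            if ( c & 1 ) r ^= x;
            x = xtime( x );
        }
        return r;
    }

    /* Reference MixBytes on one 8-byte column:
     * b_i = sum over j of circ[j] * a_{(i+j) mod 8}. */
    static void mixbytes_column( uint8_t b[8], const uint8_t a[8] )
    {
        static const uint8_t circ[8] = { 2, 2, 3, 4, 5, 3, 5, 7 };
        for ( int i = 0; i < 8; i++ )
        {
            b[i] = 0;
            for ( int j = 0; j < 8; j++ )
                b[i] ^= gf_mul( a[ (i + j) & 7 ], circ[j] );
        }
    }
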
_mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ + xmm8 = _mm_shuffle_epi8( xmm8, m128_const_64( 0x0306090c0f020508, \ + 0x0b0e0104070a0d00 ) ); \ + xmm9 = _mm_shuffle_epi8( xmm9, m128_const_64( 0x04070a0d00030609, \ + 0x0c0f0205080b0e01 ) ); \ + xmm10 = _mm_shuffle_epi8( xmm10, m128_const_64( 0x05080b0e0104070a, \ + 0x0d000306090c0f02 ) ); \ + xmm11 = _mm_shuffle_epi8( xmm11, m128_const_64( 0x06090c0f0205080b, \ + 0x0e0104070a0d0003 ) ); \ + xmm12 = _mm_shuffle_epi8( xmm12, m128_const_64( 0x070a0d000306090c, \ + 0x0f0205080b0e0104 ) ); \ + xmm13 = _mm_shuffle_epi8( xmm13, m128_const_64( 0x080b0e0104070a0d, \ + 0x000306090c0f0205 ) ); \ + xmm14 = _mm_shuffle_epi8( xmm14, m128_const_64( 0x090c0f0205080b0e, \ + 0x0104070a0d000306 ) ); \ + xmm15 = _mm_shuffle_epi8( xmm15, m128_const_64( 0x0e0104070a0d0003, \ + 0x06090c0f0205080b ) ); \ /* SubBytes + MixBytes */\ - SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \ + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \ \ /* AddRoundConstant P1024 */\ - xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ - xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ - xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ - SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + xmm0 = _mm_xor_si128( xmm0, \ + casti_m128i( round_const_p, round_counter+1 ) ); \ + xmm0 = _mm_shuffle_epi8( xmm0, m128_const_64( 0x0306090c0f020508, \ + 0x0b0e0104070a0d00 ) ); \ + xmm1 = _mm_shuffle_epi8( xmm1, m128_const_64( 0x04070a0d00030609, \ + 0x0c0f0205080b0e01 ) ); \ + xmm2 = _mm_shuffle_epi8( xmm2, m128_const_64( 0x05080b0e0104070a, \ + 0x0d000306090c0f02 ) ); \ + xmm3 = _mm_shuffle_epi8( xmm3, m128_const_64( 0x06090c0f0205080b, \ + 0x0e0104070a0d0003 ) ); \ + xmm4 = _mm_shuffle_epi8( xmm4, m128_const_64( 0x070a0d000306090c, \ + 0x0f0205080b0e0104 ) ); \ + xmm5 = _mm_shuffle_epi8( xmm5, m128_const_64( 0x080b0e0104070a0d, \ + 0x000306090c0f0205 ) ); \ + xmm6 = _mm_shuffle_epi8( xmm6, m128_const_64( 0x090c0f0205080b0e, \ + 0x0104070a0d000306 ) ); \ + xmm7 = _mm_shuffle_epi8( xmm7, m128_const_64( 0x0e0104070a0d0003, \ + 0x06090c0f0205080b ) ); \ + SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \ + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \ }\ } @@ -225,48 +251,68 @@ __m128i ALL_FF; u8 round_counter = 0;\ for(round_counter = 0; round_counter < 14; round_counter+=2) {\ /* AddRoundConstant Q1024 */\ - xmm1 = ALL_FF;\ - xmm8 = _mm_xor_si128(xmm8, xmm1);\ - xmm9 = _mm_xor_si128(xmm9, xmm1);\ - xmm10 = _mm_xor_si128(xmm10, xmm1);\ - xmm11 = _mm_xor_si128(xmm11, xmm1);\ - xmm12 = _mm_xor_si128(xmm12, xmm1);\ - xmm13 = _mm_xor_si128(xmm13, xmm1);\ - xmm14 = _mm_xor_si128(xmm14, xmm1);\ - xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ + xmm1 = m128_neg1;\ + xmm8 = _mm_xor_si128( xmm8, xmm1 ); \ + xmm9 = _mm_xor_si128( xmm9, xmm1 ); \ + xmm10 = _mm_xor_si128( xmm10, xmm1 ); \ + xmm11 = _mm_xor_si128( xmm11, 
xmm1 ); \ + xmm12 = _mm_xor_si128( xmm12, xmm1 ); \ + xmm13 = _mm_xor_si128( xmm13, xmm1 ); \ + xmm14 = _mm_xor_si128( xmm14, xmm1 ); \ + xmm15 = _mm_xor_si128( xmm15, \ + casti_m128i( round_const_q, round_counter ) ); \ /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ - xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ + xmm8 = _mm_shuffle_epi8( xmm8, m128_const_64( 0x04070a0d00030609, \ + 0x0c0f0205080b0e01 ) ); \ + xmm9 = _mm_shuffle_epi8( xmm9, m128_const_64( 0x06090c0f0205080b, \ + 0x0e0104070a0d0003 ) ); \ + xmm10 = _mm_shuffle_epi8( xmm10, m128_const_64( 0x080b0e0104070a0d, \ + 0x000306090c0f0205 ) ); \ + xmm11 = _mm_shuffle_epi8( xmm11, m128_const_64( 0x0e0104070a0d0003, \ + 0x06090c0f0205080b ) ); \ + xmm12 = _mm_shuffle_epi8( xmm12, m128_const_64( 0x0306090c0f020508, \ + 0x0b0e0104070a0d00 ) ); \ + xmm13 = _mm_shuffle_epi8( xmm13, m128_const_64( 0x05080b0e0104070a, \ + 0x0d000306090c0f02 ) ); \ + xmm14 = _mm_shuffle_epi8( xmm14, m128_const_64( 0x070a0d000306090c, \ + 0x0f0205080b0e0104 ) ); \ + xmm15 = _mm_shuffle_epi8( xmm15, m128_const_64( 0x090c0f0205080b0e, \ + 0x0104070a0d000306 ) ); \ /* SubBytes + MixBytes */\ - SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \ + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 , xmm7 ); \ \ /* AddRoundConstant Q1024 */\ - xmm9 = ALL_FF;\ - xmm0 = _mm_xor_si128(xmm0, xmm9);\ - xmm1 = _mm_xor_si128(xmm1, xmm9);\ - xmm2 = _mm_xor_si128(xmm2, xmm9);\ - xmm3 = _mm_xor_si128(xmm3, xmm9);\ - xmm4 = _mm_xor_si128(xmm4, xmm9);\ - xmm5 = _mm_xor_si128(xmm5, xmm9);\ - xmm6 = _mm_xor_si128(xmm6, xmm9);\ - xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ + xmm9 = m128_neg1;\ + xmm0 = _mm_xor_si128( xmm0, xmm9 ); \ + xmm1 = _mm_xor_si128( xmm1, xmm9 ); \ + xmm2 = _mm_xor_si128( xmm2, xmm9 ); \ + xmm3 = _mm_xor_si128( xmm3, xmm9 ); \ + xmm4 = _mm_xor_si128( xmm4, xmm9 ); \ + xmm5 = _mm_xor_si128( xmm5, xmm9 ); \ + xmm6 = _mm_xor_si128( xmm6, xmm9 ); \ + xmm7 = _mm_xor_si128( xmm7, \ + casti_m128i( round_const_q, round_counter+1 ) ); \ /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ - xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ + xmm0 = _mm_shuffle_epi8( xmm0, m128_const_64( 0x04070a0d00030609, \ + 0x0c0f0205080b0e01 ) ); \ + xmm1 = _mm_shuffle_epi8( xmm1, m128_const_64( 0x06090c0f0205080b, \ + 0x0e0104070a0d0003 ) ); \ + xmm2 = _mm_shuffle_epi8( xmm2, m128_const_64( 0x080b0e0104070a0d, \ + 0x000306090c0f0205 ) ); \ + xmm3 = _mm_shuffle_epi8( xmm3, m128_const_64( 0x0e0104070a0d0003, \ + 0x06090c0f0205080b ) ); \ + xmm4 = _mm_shuffle_epi8( xmm4, m128_const_64( 0x0306090c0f020508, \ + 0x0b0e0104070a0d00 ) ); \ + xmm5 = _mm_shuffle_epi8( xmm5, m128_const_64( 0x05080b0e0104070a, \ + 0x0d000306090c0f02 ) ); \ + xmm6 = 
_mm_shuffle_epi8( xmm6, m128_const_64( 0x070a0d000306090c, \ + 0x0f0205080b0e0104 ) ); \ + xmm7 = _mm_shuffle_epi8( xmm7, m128_const_64( 0x090c0f0205080b0e, \ + 0x0104070a0d000306 ) ); \ /* SubBytes + MixBytes */\ - SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \ + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \ }\ } @@ -278,7 +324,7 @@ __m128i ALL_FF; * clobbers: t0-t7 */ #define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - t0 = TRANSP_MASK;\ + t0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 );\ \ i6 = _mm_shuffle_epi8(i6, t0);\ i0 = _mm_shuffle_epi8(i0, t0);\ @@ -366,7 +412,7 @@ __m128i ALL_FF; i4 = _mm_unpacklo_epi64(i4, i5);\ t1 = _mm_unpackhi_epi64(t1, i5);\ t2 = i6;\ - o0 = TRANSP_MASK;\ + o0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 ); \ i6 = _mm_unpacklo_epi64(i6, i7);\ t2 = _mm_unpackhi_epi64(t2, i7);\ /* load transpose mask into a register, because it will be used 8 times */\ diff --git a/algo/groestl/aes_ni/groestl-intr-avx.h b/algo/groestl/aes_ni/groestl-intr-avx.h deleted file mode 100644 index 97f08dd..0000000 --- a/algo/groestl/aes_ni/groestl-intr-avx.h +++ /dev/null @@ -1,1072 +0,0 @@ -/* groestl-intr-avx.h Aug 2011 - * - * Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx - * instructions. - * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz - * - * This code is placed in the public domain - */ - -#include -#include -#include -#include "hash-groestl.h" - -/* global constants */ -__m128i ROUND_CONST_Lx; -__m128i ROUND_CONST_L0[ROUNDS512]; -__m128i ROUND_CONST_L7[ROUNDS512]; -__m128i ROUND_CONST_P[ROUNDS1024]; -__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_FF; -#if LENGTH <= 256 -__m128i ALL_1B; -#else -__m256d ALL_1B; -#endif - -#define tos(a) #a -#define tostr(a) tos(a) - -#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos))) -#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos)) - -#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ - SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ - SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ - SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ - SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ - }\ - ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ 
-}while(0); - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2(i, j, k, z){\ - j = _mm_cmpgt_epi8(z, i);\ - i = _mm_add_epi8(i, i);\ - j = _mm_and_si128(j, k);\ - i = _mm_xor_si128(i, j);\ -}/**/ - -/* Yet another implementation of MixBytes. - This time we use the formulae (3) from the paper "Byte Slicing Groestl". - Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). - but we use the relations: - t_i = a_i + a_{i+3} - x_i = t_i + t_{i+3} - y_i = t_i + t+{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ - b0 = a2;\ - b1 = a3;\ - b2 = a4;\ - b3 = a5;\ - b4 = a6;\ - b5 = a7;\ - b6 = a0;\ - b7 = a1;\ - \ - /* t_i = a_i + a_{i+1} */\ - a0 = _mm_xor_si128(a0, a1);\ - a1 = _mm_xor_si128(a1, a2);\ - a2 = _mm_xor_si128(a2, a3);\ - a3 = _mm_xor_si128(a3, a4);\ - a4 = _mm_xor_si128(a4, a5);\ - a5 = _mm_xor_si128(a5, a6);\ - a6 = _mm_xor_si128(a6, a7);\ - a7 = _mm_xor_si128(a7, b6);\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - b0 = _mm_xor_si128(b0, a4);\ - b1 = _mm_xor_si128(b1, a5);\ - b2 = _mm_xor_si128(b2, a6);\ - b3 = _mm_xor_si128(b3, a7);\ - b4 = _mm_xor_si128(b4, a0);\ - b5 = _mm_xor_si128(b5, a1);\ - b6 = _mm_xor_si128(b6, a2);\ - b7 = _mm_xor_si128(b7, a3);\ - \ - b0 = _mm_xor_si128(b0, a6);\ - b1 = _mm_xor_si128(b1, a7);\ - b2 = _mm_xor_si128(b2, a0);\ - b3 = _mm_xor_si128(b3, a1);\ - b4 = _mm_xor_si128(b4, a2);\ - b5 = _mm_xor_si128(b5, a3);\ - b6 = _mm_xor_si128(b6, a4);\ - b7 = _mm_xor_si128(b7, a5);\ - \ - /* spill values y_4, y_5 to memory */\ - TEMP0 = b0;\ - TEMP1 = b1;\ - TEMP2 = b2;\ - \ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - b0 = a0;\ - b1 = a1;\ - TEMP3 = a2;\ - \ - /* compute x_i = t_i + t_{i+3} */\ - a0 = _mm_xor_si128(a0, a3);\ - a1 = _mm_xor_si128(a1, a4);\ - a2 = _mm_xor_si128(a2, a5);\ - a3 = _mm_xor_si128(a3, a6);\ - a4 = _mm_xor_si128(a4, a7);\ - a5 = _mm_xor_si128(a5, b0);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, TEMP3);\ - \ - /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - b1 = ALL_1B;\ - b2 = _mm_xor_si128(b2, b2);\ - VMUL2(a7, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a0, b0, b1, b2);\ - \ - /* compute w_i : add y_{i+4} */\ - a0 = _mm_xor_si128(a0, TEMP0);\ - a1 = _mm_xor_si128(a1, TEMP1);\ - a2 = _mm_xor_si128(a2, TEMP2);\ - a3 = _mm_xor_si128(a3, b3);\ - a4 = _mm_xor_si128(a4, b4);\ - a5 = _mm_xor_si128(a5, b5);\ - a6 = _mm_xor_si128(a6, b6);\ - a7 = _mm_xor_si128(a7, b7);\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a7, b0, b1, b2);\ - \ - /* add to y_4 y_5 .. v3, v4, ... 
*/\ - b0 = _mm_xor_si128(a3, TEMP0);\ - b1 = _mm_xor_si128(a4, TEMP1);\ - b2 = _mm_xor_si128(a5, TEMP2);\ - b3 = _mm_xor_si128(b3, a6);\ - b4 = _mm_xor_si128(b4, a7);\ - b5 = _mm_xor_si128(b5, a0);\ - b6 = _mm_xor_si128(b6, a1);\ - b7 = _mm_xor_si128(b7, a2);\ -}/*MixBytes*/ - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* Add Round Constant */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a4 = _mm_xor_si128(a4, b1);\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ - \ - /* ShiftBytes + SubBytes (interleaved) */\ - b0 = _mm_xor_si128(b0, b0);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a0 = _mm_aesenclast_si128(a0, b0);\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a1 = _mm_aesenclast_si128(a1, b0);\ - a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ - a2 = _mm_aesenclast_si128(a2, b0);\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a3 = _mm_aesenclast_si128(a3, b0);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a4 = _mm_aesenclast_si128(a4, b0);\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a5 = _mm_aesenclast_si128(a5, b0);\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a6 = _mm_aesenclast_si128(a6, b0);\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - a7 = _mm_aesenclast_si128(a7, b0);\ - \ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ - \ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ - \ - o1 = _mm_unpackhi_epi16(i0, i1);\ - i0 = _mm_unpacklo_epi16(i0, i1);\ - t0 = _mm_unpackhi_epi16(i2, i3);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - \ - i0 = _mm_shuffle_epi32(i0, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - t0 
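
VMUL2 above doubles every byte of a register without branching: _mm_cmpgt_epi8 against zero produces 0xff exactly in the bytes whose top bit is set, _mm_add_epi8 performs the left shift, and the masked XOR with the 0x1b bytes applies the GF(2^8) reduction only where the shift overflowed. The same trick on a single byte, as an illustrative sketch:

    #include <stdint.h>

    /* Branchless GF(2^8) doubling, mirroring VMUL2's
     * cmpgt / add / and / xor sequence. */
    static uint8_t xtime_branchless( uint8_t x )
    {
        uint8_t mask = (uint8_t)( -(int)( x >> 7 ) );  /* 0xff iff MSB set */
        return (uint8_t)( ( x << 1 ) ^ ( mask & 0x1b ) );
    }
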
= _mm_shuffle_epi32(t0, 216);\ - \ - o2 = _mm_unpackhi_epi32(i0, i2);\ - o3 = _mm_unpackhi_epi32(o1, t0);\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - o1 = _mm_unpacklo_epi32(o1, t0);\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - o1 = _mm_unpackhi_epi64(i0, i4);\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - o2 = _mm_unpacklo_epi64(i1, i5);\ - o3 = _mm_unpackhi_epi64(i1, i5);\ - o4 = _mm_unpacklo_epi64(i2, i6);\ - o5 = _mm_unpackhi_epi64(i2, i6);\ - o6 = _mm_unpacklo_epi64(i3, i7);\ - o7 = _mm_unpackhi_epi64(i3, i7);\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - o0 = _mm_unpackhi_epi64(i0, i1);\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o1 = _mm_unpackhi_epi64(i2, i3);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - o2 = _mm_unpackhi_epi64(i4, i5);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - o3 = _mm_unpackhi_epi64(i6, i7);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - t0 = _mm_xor_si128(t0, t0);\ - i1 = _mm_unpackhi_epi64(i0, t0);\ - i0 = _mm_unpacklo_epi64(i0, t0);\ - i3 = _mm_unpackhi_epi64(i2, t0);\ - i2 = _mm_unpacklo_epi64(i2, t0);\ - i5 = _mm_unpackhi_epi64(i4, t0);\ - i4 = _mm_unpacklo_epi64(i4, t0);\ - i7 = _mm_unpackhi_epi64(i6, t0);\ - i6 = _mm_unpacklo_epi64(i6, t0);\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - - -void INIT(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; - static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; - - /* load IV into registers xmm12 - xmm15 */ - xmm12 = chaining[0]; - xmm13 = chaining[1]; - xmm14 = chaining[2]; - xmm15 = chaining[3]; - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* store transposed IV */ - chaining[0] = xmm12; - chaining[1] = xmm2; - chaining[2] = xmm6; - chaining[3] = xmm7; -} - -void TF512(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP0; - static __m128i TEMP1; - static __m128i TEMP2; - static __m128i TEMP3; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm12 - xmm15 */ - xmm12 = message[0]; - xmm13 = message[1]; - xmm14 = message[2]; - xmm15 = message[3]; - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - xmm8 = _mm_xor_si128(xmm12, chaining[0]); - xmm0 = _mm_xor_si128(xmm2, chaining[1]); - xmm4 = _mm_xor_si128(xmm6, chaining[2]); - xmm5 = _mm_xor_si128(xmm7, chaining[3]); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm10); - xmm2 = _mm_xor_si128(xmm2, xmm12); - xmm3 = _mm_xor_si128(xmm3, xmm14); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, chaining[0]); - xmm1 = _mm_xor_si128(xmm1, chaining[1]); - xmm2 = _mm_xor_si128(xmm2, chaining[2]); - xmm3 = _mm_xor_si128(xmm3, chaining[3]); - - /* store CV */ - chaining[0] = xmm0; - chaining[1] = xmm1; - chaining[2] = xmm2; - chaining[3] = xmm3; - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP0; - static __m128i TEMP1; - static __m128i TEMP2; - static __m128i TEMP3; - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - xmm8 = chaining[0]; - xmm10 = chaining[1]; - xmm12 = chaining[2]; - xmm14 = chaining[3]; - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm10 = _mm_xor_si128(xmm10, (chaining[1])); - xmm12 = _mm_xor_si128(xmm12, (chaining[2])); - xmm14 = _mm_xor_si128(xmm14, (chaining[3])); - - /* transform state back from row ordering into column ordering */ - /* result: final 
hash value in xmm9, xmm11 */ - Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); - - /* we only need to return the truncated half of the state */ - chaining[2] = xmm9; - chaining[3] = xmm11; -} - -#endif - -#if (LENGTH > 256) - -#define SET_CONSTANTS(){\ - __m128i xmm0, xmm1;\ - __m256d ymm0;\ - xmm0 = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - xmm1 = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - ymm0 = insert_m128i_in_m256d(ymm0, xmm0, 0);\ - ymm0 = insert_m128i_in_m256d(ymm0, xmm1, 1);\ - ALL_1B = ymm0;\ - ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ - SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ - SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ - SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ - }\ -}while(0); - -/* AVX MUL2 - * input: i - * output i = 2 * i - * */ -#define VMUL2(i){\ - xmmZERO = _mm_xor_si128(xmmZERO, xmmZERO);\ - xmmIL = extract_m128i_from_m256d(i, 0);\ - xmmIH = extract_m128i_from_m256d(i, 1);\ - xmmJL = _mm_cmpgt_epi8(xmmZERO, xmmIL);\ - xmmJH = _mm_cmpgt_epi8(xmmZERO, xmmIH);\ - xmmIL = _mm_add_epi8(xmmIL, xmmIL);\ - xmmIH = _mm_add_epi8(xmmIH, xmmIH);\ - ymmJ = insert_m128i_in_m256d(ymmJ, xmmJL, 0);\ - ymmJ = insert_m128i_in_m256d(ymmJ, xmmJH, 1);\ - ymmJ = _mm256_and_pd(ymmJ, ALL_1B);\ - i = insert_m128i_in_m256d(i, xmmIL, 0);\ - i = insert_m128i_in_m256d(i, xmmIH, 1);\ - i = _mm256_xor_pd(i, ymmJ);\ -}/**/ - -/* AVX SubShift - * inputs: - * * i - * * c0 (must be 0) - * * ShiftP - * * ShiftQ - * output i = S(Shift(i_1, ShiftQ)|Shift(i_0, ShiftP)) - * clobbers: t0 - * */ -#define SubShift(i, ShiftP, ShiftQ){\ - xmmZERO = _mm_xor_si128(xmmZERO, xmmZERO);\ - xmmIL = extract_m128i_from_m256d(i, 0);\ - xmmIH = extract_m128i_from_m256d(i, 1);\ - xmmIL = _mm_shuffle_epi8(xmmIL, SUBSH_MASK[ShiftP]);\ - xmmIH = _mm_shuffle_epi8(xmmIH, SUBSH_MASK[ShiftQ]);\ - xmmIL = _mm_aesenclast_si128(xmmIL, xmmZERO);\ - xmmIH = _mm_aesenclast_si128(xmmIH, xmmZERO);\ - i = insert_m128i_in_m256d(i, xmmIL, 0);\ - i = insert_m128i_in_m256d(i, xmmIH, 1);\ -}/**/ - -/* Yet another implementation of MixBytes. - This time we use the formulae (3) from the paper "Byte Slicing Groestl". - Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). 
- but we use the relations: - t_i = a_i + a_{i+1} - x_i = t_i + t_{i+3} - y_i = t_i + t_{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* xmm8..xmm15 = a2 a3 ... a0 a1 */\ - b0 = a2;\ - b1 = a3;\ - b2 = a4;\ - b3 = a5;\ - b4 = a6;\ - b5 = a7;\ - b6 = a0;\ - b7 = a1;\ - \ - /* t_i = a_i + a_{i+1} */\ - a0 = _mm256_xor_pd(a0, a1);\ - a1 = _mm256_xor_pd(a1, a2);\ - a2 = _mm256_xor_pd(a2, a3);\ - a3 = _mm256_xor_pd(a3, a4);\ - a4 = _mm256_xor_pd(a4, a5);\ - a5 = _mm256_xor_pd(a5, a6);\ - a6 = _mm256_xor_pd(a6, a7);\ - a7 = _mm256_xor_pd(a7, b6);\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - b0 = _mm256_xor_pd(b0, a4);\ - b1 = _mm256_xor_pd(b1, a5);\ - b2 = _mm256_xor_pd(b2, a6);\ - b3 = _mm256_xor_pd(b3, a7);\ - b4 = _mm256_xor_pd(b4, a0);\ - b5 = _mm256_xor_pd(b5, a1);\ - b6 = _mm256_xor_pd(b6, a2);\ - b7 = _mm256_xor_pd(b7, a3);\ - \ - b0 = _mm256_xor_pd(b0, a6);\ - b1 = _mm256_xor_pd(b1, a7);\ - b2 = _mm256_xor_pd(b2, a0);\ - b3 = _mm256_xor_pd(b3, a1);\ - b4 = _mm256_xor_pd(b4, a2);\ - b5 = _mm256_xor_pd(b5, a3);\ - b6 = _mm256_xor_pd(b6, a4);\ - b7 = _mm256_xor_pd(b7, a5);\ - \ - /* spill values y_4, y_5 to memory */\ - TEMP0 = b0;\ - TEMP1 = b1;\ - TEMP2 = b2;\ - \ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - b0 = a0;\ - b1 = a1;\ - TEMP3 = a2;\ - \ - /* compute x_i = t_i + t_{i+3} */\ - a0 = _mm256_xor_pd(a0, a3);\ - a1 = _mm256_xor_pd(a1, a4);\ - a2 = _mm256_xor_pd(a2, a5);\ - a3 = _mm256_xor_pd(a3, a6);\ - a4 = _mm256_xor_pd(a4, a7);\ - a5 = _mm256_xor_pd(a5, b0);\ - a6 = _mm256_xor_pd(a6, b1);\ - a7 = _mm256_xor_pd(a7, TEMP3);\ - \ - /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - b1 = ALL_1B;\ - b2 = _mm256_xor_pd(b2, b2);\ - VMUL2(a7);\ - VMUL2(a6);\ - VMUL2(a5);\ - VMUL2(a4);\ - VMUL2(a3);\ - VMUL2(a2);\ - VMUL2(a1);\ - VMUL2(a0);\ - \ - /* compute w_i : add y_{i+4} */\ - a0 = _mm256_xor_pd(a0, TEMP0);\ - a1 = _mm256_xor_pd(a1, TEMP1);\ - a2 = _mm256_xor_pd(a2, TEMP2);\ - a3 = _mm256_xor_pd(a3, b3);\ - a4 = _mm256_xor_pd(a4, b4);\ - a5 = _mm256_xor_pd(a5, b5);\ - a6 = _mm256_xor_pd(a6, b6);\ - a7 = _mm256_xor_pd(a7, b7);\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0);\ - VMUL2(a1);\ - VMUL2(a2);\ - VMUL2(a3);\ - VMUL2(a4);\ - VMUL2(a5);\ - VMUL2(a6);\ - VMUL2(a7);\ - \ - /* add to y_4 y_5 .. v3, v4, ... */\ - b0 = _mm256_xor_pd(a3, TEMP0);\ - b1 = _mm256_xor_pd(a4, TEMP1);\ - b2 = _mm256_xor_pd(a5, TEMP2);\ - b3 = _mm256_xor_pd(b3, a6);\ - b4 = _mm256_xor_pd(b4, a7);\ - b5 = _mm256_xor_pd(b5, a0);\ - b6 = _mm256_xor_pd(b6, a1);\ - b7 = _mm256_xor_pd(b7, a2);\ -}/*MixBytes*/
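The relations in the comment above are easy to sanity-check in scalar form. Below is a standalone scalar sketch, not part of the patch (mul2, gmul, and mixbytes_column are illustrative names): mul2 is the GF(2^8) doubling that VMUL2 performs bytewise with pcmpgtb/paddb/pand/pxor, and mixbytes_column rebuilds one 8-byte MixBytes column from the t/x/y/w/v chain, checking it against the direct circulant circ(02,02,03,04,05,03,05,07).

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch, not from the patch. GF(2^8) doubling modulo the
   Groestl/AES polynomial x^8+x^4+x^3+x+1 (0x11b) -- the scalar equivalent
   of the VMUL2/MUL2 macros above. */
static uint8_t mul2(uint8_t a)
{
    return (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0x00));
}

/* One MixBytes column via the relations in the comment above; a[] and b[]
   hold the 8 row bytes of one state column, indices are mod 8. */
static void mixbytes_column(const uint8_t a[8], uint8_t b[8])
{
    uint8_t t[8], x[8], y[8], w[8], v[8];
    for (int i = 0; i < 8; i++) t[i] = a[i] ^ a[(i + 1) & 7];
    for (int i = 0; i < 8; i++) x[i] = t[i] ^ t[(i + 3) & 7];
    for (int i = 0; i < 8; i++) y[i] = t[i] ^ t[(i + 2) & 7] ^ a[(i + 6) & 7];
    for (int i = 0; i < 8; i++) w[i] = mul2(x[i]) ^ y[(i + 4) & 7]; /* z_i folded in */
    for (int i = 0; i < 8; i++) v[i] = mul2(w[i]);
    for (int i = 0; i < 8; i++) b[i] = v[(i + 3) & 7] ^ y[(i + 4) & 7];
}

/* Multiply by a small constant using doublings only. */
static uint8_t gmul(uint8_t a, int k)
{
    uint8_t a2 = mul2(a), a4 = mul2(a2);
    switch (k) {
    case 2: return a2;
    case 3: return (uint8_t)(a2 ^ a);
    case 4: return a4;
    case 5: return (uint8_t)(a4 ^ a);
    case 7: return (uint8_t)(a4 ^ a2 ^ a);
    }
    return a;
}

int main(void)
{
    static const int C[8] = { 2, 2, 3, 4, 5, 3, 5, 7 }; /* circulant row */
    uint8_t a[8] = { 0x00, 0x01, 0x35, 0x80, 0xfe, 0x13, 0xc7, 0x5a };
    uint8_t b[8], r[8];
    mixbytes_column(a, b);
    for (int i = 0; i < 8; i++) {
        r[i] = 0;
        for (int j = 0; j < 8; j++) r[i] ^= gmul(a[(i + j) & 7], C[j]);
    }
    for (int i = 0; i < 8; i++)
        printf("row %d: %02x %02x %s\n", i, b[i], r[i],
               b[i] == r[i] ? "ok" : "MISMATCH");
    return 0;
}

Compiled as plain C99 (e.g. cc -std=c99), every row should print ok; the same chain is what the register scheduling above implements with a handful of spills (TEMP0..TEMP3).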
- -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBSHIFTMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* ShiftBytes + SubBytes */\ - SubShift(a0, 0, 1);\ - SubShift(a1, 1, 3);\ - SubShift(a2, 2, 5);\ - SubShift(a3, 3, 7);\ - SubShift(a4, 4, 0);\ - SubShift(a5, 5, 2);\ - SubShift(a6, 6, 4);\ - SubShift(a7, 7, 6);\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -#define ROUNDS_P_Q(){\ - u8 round_counter = 0;\ - for(round_counter = 0; round_counter < 14; round_counter++) {\ - /* AddRoundConstant */\ - ymm6 = _mm256_xor_pd(ymm6, ymm6);\ - ymm7 = insert_m128i_in_m256d(ymm6, ROUND_CONST_Q[round_counter], 1);\ - ymm6 = insert_m128i_in_m256d(ymm6, ALL_FF, 1);\ - ymm0 = insert_m128i_in_m256d(ymm6, ROUND_CONST_P[round_counter], 0);\ - ymm0 = _mm256_xor_pd(ymm8, ymm0);\ - ymm1 = _mm256_xor_pd(ymm9, ymm6);\ - ymm2 = _mm256_xor_pd(ymm10, ymm6);\ - ymm3 = _mm256_xor_pd(ymm11, ymm6);\ - ymm4 = _mm256_xor_pd(ymm12, ymm6);\ - ymm5 = _mm256_xor_pd(ymm13, ymm6);\ - ymm6 = _mm256_xor_pd(ymm14, ymm6);\ - ymm7 = _mm256_xor_pd(ymm15, ymm7);\ - /* SubBytes + ShiftBytes + MixBytes */\ - SUBSHIFTMIX(ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15);\ - }\ -} - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - t0 = TRANSP_MASK;\ - \ - i6 = _mm_shuffle_epi8(i6, t0);\ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ - i4 = _mm_shuffle_epi8(i4, t0);\ - i5 = _mm_shuffle_epi8(i5, t0);\ - i7 = _mm_shuffle_epi8(i7, t0);\ - \ - /* continue with unpack */\ - t0 = _mm_unpackhi_epi16(i0, i1);\ - t1 = _mm_unpackhi_epi16(i2, i3);\ - t2 = _mm_unpackhi_epi16(i4, i5);\ - t3 = _mm_unpackhi_epi16(i6, i7);\ - i0 = _mm_unpacklo_epi16(i0, i1);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - i4 = _mm_unpacklo_epi16(i4, i5);\ - i6 = _mm_unpacklo_epi16(i6, i7);\ - \ - /* shuffle with immediate */\ - t0 = _mm_shuffle_epi32(t0, 216);\ - t1 = _mm_shuffle_epi32(t1, 216);\ - t2 = _mm_shuffle_epi32(t2, 216);\ - t3 = _mm_shuffle_epi32(t3, 216);\ - i0 = _mm_shuffle_epi32(i0, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - i4 = _mm_shuffle_epi32(i4, 216);\ - i6 = _mm_shuffle_epi32(i6, 216);\ - \ - /* continue with unpack */\ - t4 = _mm_unpackhi_epi32(i0, i2);\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - t5 = _mm_unpackhi_epi32(t0, t1);\ - t0 = _mm_unpacklo_epi32(t0, t1);\ - t6 = _mm_unpackhi_epi32(i4, i6);\ - i4 = _mm_unpacklo_epi32(i4, i6);\ - t7 = _mm_unpackhi_epi32(t2, t3);\ - t2 = _mm_unpacklo_epi32(t2, t3);\ - \ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - i1 = _mm_unpackhi_epi64(i0, i4);\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - i2 = _mm_unpacklo_epi64(t0, t2);\ - i3 = _mm_unpackhi_epi64(t0, t2);\ - i4 = _mm_unpacklo_epi64(t4, t6);\ - i5 = _mm_unpackhi_epi64(t4, t6);\ - i6 = _mm_unpacklo_epi64(t5, t7);\ - i7 = _mm_unpackhi_epi64(t5, t7);\ - /* transpose done */\ -}/**/ - -/* Matrix
Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - o0 = TRANSP_MASK;\ - /* transpose matrix to get output format */\ - o1 = _mm_unpackhi_epi64(i0, i1);\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - t0 = _mm_unpackhi_epi64(i2, i3);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - t1 = _mm_unpackhi_epi64(i4, i5);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - t2 = _mm_unpackhi_epi64(i6, i7);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ - /* load transpose mask into a register, because it will be used 8 times */\ - i0 = _mm_shuffle_epi8(i0, o0);\ - i2 = _mm_shuffle_epi8(i2, o0);\ - i4 = _mm_shuffle_epi8(i4, o0);\ - i6 = _mm_shuffle_epi8(i6, o0);\ - o1 = _mm_shuffle_epi8(o1, o0);\ - t0 = _mm_shuffle_epi8(t0, o0);\ - t1 = _mm_shuffle_epi8(t1, o0);\ - t2 = _mm_shuffle_epi8(t2, o0);\ - /* continue with unpack */\ - t3 = _mm_unpackhi_epi16(i4, i6);\ - i4 = _mm_unpacklo_epi16(i4, i6);\ - o0 = _mm_unpackhi_epi16(i0, i2);\ - i0 = _mm_unpacklo_epi16(i0, i2);\ - o2 = _mm_unpackhi_epi16(o1, t0);\ - o1 = _mm_unpacklo_epi16(o1, t0);\ - t4 = _mm_unpackhi_epi16(t1, t2);\ - t1 = _mm_unpacklo_epi16(t1, t2);\ - /* shuffle with immediate */\ - i4 = _mm_shuffle_epi32(i4, 216);\ - t3 = _mm_shuffle_epi32(t3, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - o2 = _mm_shuffle_epi32(o2, 216);\ - i0 = _mm_shuffle_epi32(i0, 216);\ - o0 = _mm_shuffle_epi32(o0, 216);\ - t1 = _mm_shuffle_epi32(t1, 216);\ - t4 = _mm_shuffle_epi32(t4, 216);\ - /* continue with unpack */\ - i1 = _mm_unpackhi_epi32(i0, i4);\ - i0 = _mm_unpacklo_epi32(i0, i4);\ - i3 = _mm_unpackhi_epi32(o0, t3);\ - o0 = _mm_unpacklo_epi32(o0, t3);\ - i5 = _mm_unpackhi_epi32(o1, t1);\ - o1 = _mm_unpacklo_epi32(o1, t1);\ - i7 = _mm_unpackhi_epi32(o2, t4);\ - o2 = _mm_unpacklo_epi32(o2, t4);\ - /* transpose done */\ -}/**/ - -void INIT(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - - /* load IV into registers xmm8 - xmm15 */ - xmm8 = chaining[0]; - xmm9 = chaining[1]; - xmm10 = chaining[2]; - xmm11 = chaining[3]; - xmm12 = chaining[4]; - xmm13 = chaining[5]; - xmm14 = chaining[6]; - xmm15 = chaining[7]; - - /* transform chaining value from column ordering into row ordering */ - Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - - /* store transposed IV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; -} - -void TF1024(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i xmmIL, xmmIH, xmmJL, xmmJH, xmmZERO; - static __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; - static __m256d ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15; - static __m256d ymmJ; - static __m256d TEMP0; - static __m256d TEMP1; - static __m256d TEMP2; - static __m256d TEMP3; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm8 - xmm15 (Q = message) */ - xmm0 = message[0]; - xmm1 = 
message[1]; - xmm2 = message[2]; - xmm3 = message[3]; - xmm4 = message[4]; - xmm5 = message[5]; - xmm6 = message[6]; - xmm7 = message[7]; - - /* transform message M from column ordering into row ordering */ - Matrix_Transpose(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm0, chaining[0]); - xmm9 = _mm_xor_si128(xmm1, chaining[1]); - xmm10 = _mm_xor_si128(xmm2, chaining[2]); - xmm11 = _mm_xor_si128(xmm3, chaining[3]); - xmm12 = _mm_xor_si128(xmm4, chaining[4]); - xmm13 = _mm_xor_si128(xmm5, chaining[5]); - xmm14 = _mm_xor_si128(xmm6, chaining[6]); - xmm15 = _mm_xor_si128(xmm7, chaining[7]); - - /* generate AVX registers with Q in high and P in low 128 bits */ - ymm8 = insert_m128i_in_m256d(ymm8, xmm8, 0); - ymm9 = insert_m128i_in_m256d(ymm9, xmm9, 0); - ymm10 = insert_m128i_in_m256d(ymm10, xmm10, 0); - ymm11 = insert_m128i_in_m256d(ymm11, xmm11, 0); - ymm12 = insert_m128i_in_m256d(ymm12, xmm12, 0); - ymm13 = insert_m128i_in_m256d(ymm13, xmm13, 0); - ymm14 = insert_m128i_in_m256d(ymm14, xmm14, 0); - ymm15 = insert_m128i_in_m256d(ymm15, xmm15, 0); - - ymm8 = insert_m128i_in_m256d(ymm8, xmm0, 1); - ymm9 = insert_m128i_in_m256d(ymm9, xmm1, 1); - ymm10 = insert_m128i_in_m256d(ymm10, xmm2, 1); - ymm11 = insert_m128i_in_m256d(ymm11, xmm3, 1); - ymm12 = insert_m128i_in_m256d(ymm12, xmm4, 1); - ymm13 = insert_m128i_in_m256d(ymm13, xmm5, 1); - ymm14 = insert_m128i_in_m256d(ymm14, xmm6, 1); - ymm15 = insert_m128i_in_m256d(ymm15, xmm7, 1); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* extract Q to xmm */ - xmm0 = extract_m128i_from_m256d(ymm8, 1); - xmm1 = extract_m128i_from_m256d(ymm9, 1); - xmm2 = extract_m128i_from_m256d(ymm10, 1); - xmm3 = extract_m128i_from_m256d(ymm11, 1); - xmm4 = extract_m128i_from_m256d(ymm12, 1); - xmm5 = extract_m128i_from_m256d(ymm13, 1); - xmm6 = extract_m128i_from_m256d(ymm14, 1); - xmm7 = extract_m128i_from_m256d(ymm15, 1); - - /* extract P to xmm */ - xmm8 = extract_m128i_from_m256d(ymm8, 0); - xmm9 = extract_m128i_from_m256d(ymm9, 0); - xmm10 = extract_m128i_from_m256d(ymm10, 0); - xmm11 = extract_m128i_from_m256d(ymm11, 0); - xmm12 = extract_m128i_from_m256d(ymm12, 0); - xmm13 = extract_m128i_from_m256d(ymm13, 0); - xmm14 = extract_m128i_from_m256d(ymm14, 0); - xmm15 = extract_m128i_from_m256d(ymm15, 0); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, xmm0); - xmm9 = _mm_xor_si128(xmm9, xmm1); - xmm10 = _mm_xor_si128(xmm10, xmm2); - xmm11 = _mm_xor_si128(xmm11, xmm3); - xmm12 = _mm_xor_si128(xmm12, xmm4); - xmm13 = _mm_xor_si128(xmm13, xmm5); - xmm14 = _mm_xor_si128(xmm14, xmm6); - xmm15 = _mm_xor_si128(xmm15, xmm7); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, chaining[0]); - xmm9 = _mm_xor_si128(xmm9, chaining[1]); - xmm10 = _mm_xor_si128(xmm10, chaining[2]); - xmm11 = _mm_xor_si128(xmm11, chaining[3]); - xmm12 = _mm_xor_si128(xmm12, chaining[4]); - xmm13 = _mm_xor_si128(xmm13, chaining[5]); - xmm14 = _mm_xor_si128(xmm14, chaining[6]); - xmm15 = _mm_xor_si128(xmm15, chaining[7]); - - /* store CV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - 
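An editorial aside on the XOR sequences and stores in this function: groestl-intr-avx.h targets first-generation AVX, which has no 256-bit integer XOR (_mm256_xor_si256 arrived with AVX2), so every 256-bit XOR of integer state here is issued as the bitwise floating-point op _mm256_xor_pd. A minimal standalone sketch of the idiom, with illustrative array and variable names:

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>   /* AVX: __m256d, _mm256_xor_pd */

int main(void)
{
    /* Illustrative sketch, not part of the patch: XOR two 256-bit blocks
       of integer data through the float domain, as TF1024 does above. */
    uint64_t a[4] = { 1, 2, 3, 4 };
    uint64_t b[4] = { 0xff, 0xff00, 0xff0000, 0xff000000 };
    uint64_t r[4];

    __m256d va = _mm256_loadu_pd((const double *)a);
    __m256d vb = _mm256_loadu_pd((const double *)b);
    /* _mm256_xor_pd is purely bitwise: no FP arithmetic and no exceptions,
       so arbitrary integer bit patterns pass through unchanged. */
    _mm256_storeu_pd((double *)r, _mm256_xor_pd(va, vb));

    for (int i = 0; i < 4; i++)
        printf("%016llx\n", (unsigned long long)r[i]);
    return 0;
}

Build with an AVX-enabled compiler (e.g. cc -mavx); the loads and stores are raw 256-bit moves, so no value is ever interpreted as a double.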
chaining[6] = xmm14; - chaining[7] = xmm15; - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF1024(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i xmmIL, xmmIH, xmmJL, xmmJH, xmmZERO; - static __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; - static __m256d ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15; - static __m256d ymmJ; - static __m256d TEMP0; - static __m256d TEMP1; - static __m256d TEMP2; - static __m256d TEMP3; - - /* load CV into registers xmm8...xmm15 */ - xmm8 = chaining[0]; - xmm9 = chaining[1]; - xmm10 = chaining[2]; - xmm11 = chaining[3]; - xmm12 = chaining[4]; - xmm13 = chaining[5]; - xmm14 = chaining[6]; - xmm15 = chaining[7]; - - xmm0 = _mm_xor_si128(xmm0, xmm0); - - /* generate AVX registers with Q in high and P in low 128 bits */ - ymm8 = insert_m128i_in_m256d(ymm8, xmm8, 0); - ymm9 = insert_m128i_in_m256d(ymm9, xmm9, 0); - ymm10 = insert_m128i_in_m256d(ymm10, xmm10, 0); - ymm11 = insert_m128i_in_m256d(ymm11, xmm11, 0); - ymm12 = insert_m128i_in_m256d(ymm12, xmm12, 0); - ymm13 = insert_m128i_in_m256d(ymm13, xmm13, 0); - ymm14 = insert_m128i_in_m256d(ymm14, xmm14, 0); - ymm15 = insert_m128i_in_m256d(ymm15, xmm15, 0); - - ymm8 = insert_m128i_in_m256d(ymm8, xmm0, 1); - ymm9 = insert_m128i_in_m256d(ymm9, xmm0, 1); - ymm10 = insert_m128i_in_m256d(ymm10, xmm0, 1); - ymm11 = insert_m128i_in_m256d(ymm11, xmm0, 1); - ymm12 = insert_m128i_in_m256d(ymm12, xmm0, 1); - ymm13 = insert_m128i_in_m256d(ymm13, xmm0, 1); - ymm14 = insert_m128i_in_m256d(ymm14, xmm0, 1); - ymm15 = insert_m128i_in_m256d(ymm15, xmm0, 1); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8...xmm15 */ - ROUNDS_P_Q(); - - xmm8 = extract_m128i_from_m256d(ymm8, 0); - xmm9 = extract_m128i_from_m256d(ymm9, 0); - xmm10 = extract_m128i_from_m256d(ymm10, 0); - xmm11 = extract_m128i_from_m256d(ymm11, 0); - xmm12 = extract_m128i_from_m256d(ymm12, 0); - xmm13 = extract_m128i_from_m256d(ymm13, 0); - xmm14 = extract_m128i_from_m256d(ymm14, 0); - xmm15 = extract_m128i_from_m256d(ymm15, 0); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, chaining[0]); - xmm9 = _mm_xor_si128(xmm9, chaining[1]); - xmm10 = _mm_xor_si128(xmm10, chaining[2]); - xmm11 = _mm_xor_si128(xmm11, chaining[3]); - xmm12 = _mm_xor_si128(xmm12, chaining[4]); - xmm13 = _mm_xor_si128(xmm13, chaining[5]); - xmm14 = _mm_xor_si128(xmm14, chaining[6]); - xmm15 = _mm_xor_si128(xmm15, chaining[7]); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); - - /* we only need to return the truncated half of the state */ - chaining[0] = xmm8; - chaining[1] = xmm4; - chaining[2] = xmm9; - chaining[3] = xmm11; - chaining[4] = xmm0; - chaining[5] = xmm6; - chaining[6] = xmm13; - chaining[7] = xmm15; - - return; -}//OF1024() - -#endif - diff --git a/algo/groestl/aes_ni/groestl-intr-vperm.h b/algo/groestl/aes_ni/groestl-intr-vperm.h deleted file mode 100644 index c755229..0000000 --- a/algo/groestl/aes_ni/groestl-intr-vperm.h +++ /dev/null @@ -1,1294 +0,0 @@ -/* groestl-intr-vperm.h Aug 2011 - * - * Groestl implementation with intrinsics using ssse3 instructions. - * Author: Günther A. 
Roland, Martin Schläffer - * - * Based on the vperm and aes_ni implementations of the hash function Groestl - * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ - * Institute of Applied Mathematics, Middle East Technical University, Turkey - * - * This code is placed in the public domain - */ - -#include <tmmintrin.h> -#include "hash-groestl.h" - -/* global constants */ -__m128i ROUND_CONST_Lx; -__m128i ROUND_CONST_L0[ROUNDS512]; -__m128i ROUND_CONST_L7[ROUNDS512]; -__m128i ROUND_CONST_P[ROUNDS1024]; -__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_0F; -__m128i ALL_15; -__m128i ALL_1B; -__m128i ALL_63; -__m128i ALL_FF; -__m128i VPERM_IPT[2]; -__m128i VPERM_OPT[2]; -__m128i VPERM_INV[2]; -__m128i VPERM_SB1[2]; -__m128i VPERM_SB2[2]; -__m128i VPERM_SB4[2]; -__m128i VPERM_SBO[2]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_SHARED_CONSTANTS(){\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\ - ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\ - ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\ - VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\ - VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\ - VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\ - VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\ - VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\ - VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\ - VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\ - VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\ - VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\ - VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\ - VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\ - VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\ -}/**/ - -/* VPERM - * Transform w/o settings c* - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Groestl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ - t0 = c0;\ - t1 = c0;\ - t0 = _mm_andnot_si128(t0, a0);\ - t1 = _mm_andnot_si128(t1, a1);\ - t0 = _mm_srli_epi32(t0, 4);\ - t1 = _mm_srli_epi32(t1, 4);\ - a0 = _mm_and_si128(a0, c0);\ - a1 = _mm_and_si128(a1, c0);\ - t2 = c2;\ - t3 = c2;\ - t2 = _mm_shuffle_epi8(t2, a0);\ - t3 = _mm_shuffle_epi8(t3, a1);\ - a0 = c1;\ - a1 = c1;\ - a0 = _mm_shuffle_epi8(a0, t0);\ - a1 = _mm_shuffle_epi8(a1, t1);\ - a0 = _mm_xor_si128(a0, t2);\ - a1 = _mm_xor_si128(a1, t3);\ -}/**/ - -#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ - c0 = ALL_0F;\ - c1 = ((__m128i*) table )[0];\ - c2 = ((__m128i*) table )[1];\ -}/**/
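All of the VPERM_* building blocks that follow lean on one primitive: pshufb (_mm_shuffle_epi8) used as sixteen parallel lookups into a 16-entry byte table, applied separately to the low and high nibbles exactly as VPERM_Transform_No_Const does above with the ALL_0F mask. A standalone sketch of the bare trick, not part of the patch (per-byte popcount instead of Groestl's S-box; tbl and m0f are illustrative names):

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

int main(void)
{
    /* 16-entry table indexed by a 4-bit value: here, popcount(0..15). */
    const __m128i tbl = _mm_setr_epi8(0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4);
    const __m128i m0f = _mm_set1_epi8(0x0f);

    uint8_t in[16], out[16];
    for (int i = 0; i < 16; i++) in[i] = (uint8_t)(i * 37 + 11);

    __m128i x  = _mm_loadu_si128((const __m128i *)in);
    __m128i lo = _mm_and_si128(x, m0f);                     /* low nibbles  */
    __m128i hi = _mm_and_si128(_mm_srli_epi16(x, 4), m0f);  /* high nibbles */
    /* Each byte of the index operand selects a byte of the table operand;
       indices stay in 0..15, so pshufb's sign-bit zeroing never triggers.
       Two lookups plus an add give the per-byte popcount. */
    __m128i cnt = _mm_add_epi8(_mm_shuffle_epi8(tbl, lo),
                               _mm_shuffle_epi8(tbl, hi));
    _mm_storeu_si128((__m128i *)out, cnt);

    for (int i = 0; i < 16; i++)
        printf("%02x -> %d\n", in[i], out[i]);
    return 0;
}

Build with an SSSE3-enabled compiler (e.g. cc -mssse3). Hamburg's constant-time AES construction layers several such table pairs (the VPERM_SB1/SB2/SB4 constants above) to evaluate the S-box and its *2/*4 multiples without data-dependent memory accesses.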
/* VPERM - * Transform - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Groestl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Transform State - * inputs: - * a0-a3 = state - * table = transformation table to use - * t* = clobbers - * outputs: - * a0-a3 = transformed state - * */ -#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ - VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Add Constant to State - * inputs: - * a0-a7 = state - * constant = constant to add - * t0 = clobber - * outputs: - * a0-a7 = state + constant - * */ -#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ - t0 = constant;\ - a0 = _mm_xor_si128(a0, t0);\ - a1 = _mm_xor_si128(a1, t0);\ - a2 = _mm_xor_si128(a2, t0);\ - a3 = _mm_xor_si128(a3, t0);\ - a4 = _mm_xor_si128(a4, t0);\ - a5 = _mm_xor_si128(a5, t0);\ - a6 = _mm_xor_si128(a6, t0);\ - a7 = _mm_xor_si128(a7, t0);\ -}/**/ - -/* VPERM - * Set Substitute Core Constants - * */ -#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ - VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ -}/**/ - -/* VPERM - * Substitute Core - * first part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Groestl - * by Cagdas CALIK - * inputs: - * a0 = 1 row - * t*, c* = clobbers - * outputs: - * b0a, b0b = inputs for lookup step - * */ -#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ - t0 = c0;\ - t0 = _mm_andnot_si128(t0, a0);\ - t0 = _mm_srli_epi32(t0, 4);\ - a0 = _mm_and_si128(a0, c0);\ - b0a = c1;\ - b0a = _mm_shuffle_epi8(b0a, a0);\ - a0 = _mm_xor_si128(a0, t0);\ - b0b = c2;\ - b0b = _mm_shuffle_epi8(b0b, t0);\ - b0b = _mm_xor_si128(b0b, b0a);\ - t1 = c2;\ - t1 = _mm_shuffle_epi8(t1, a0);\ - t1 = _mm_xor_si128(t1, b0a);\ - b0a = c2;\ - b0a = _mm_shuffle_epi8(b0a, b0b);\ - b0a = _mm_xor_si128(b0a, a0);\ - b0b = c2;\ - b0b = _mm_shuffle_epi8(b0b, t1);\ - b0b = _mm_xor_si128(b0b, t0);\ -}/**/ - -/* VPERM - * Lookup - * second part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Groestl - * by Cagdas CALIK - * inputs: - * a0a, a0b = output of Substitution Core - * table = lookup table to use (*1 / *2 / *4) - * t0 = clobber - * outputs: - * b0 = output of sbox + multiplication - * */ -#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ - b0 = ((__m128i*) table )[0];\ - t0 = ((__m128i*) table )[1];\ - b0 = _mm_shuffle_epi8(b0, a0b);\ - t0 = _mm_shuffle_epi8(t0, a0a);\ - b0 = _mm_xor_si128(b0, t0);\ -}/**/ - -/* VPERM - * SubBytes and *2 / *4 - * this function is derived from: - * Constant-time SSSE3 AES core implementation - * by Mike Hamburg - * and - * vperm and aes_ni implementations of hash function Groestl - * by Cagdas CALIK - * inputs: - * a0-a7 = state - * t*, c* = clobbers - * outputs: - * a0-a7 = state * 4 - * c2 = row0 * 2 -> b0 - * c1 = row7 * 2 -> b3 - * c0 = row7 * 1 -> b4 - * t2 = row4 * 1 -> b7 - * TEMP_MUL1 = row(i) * 1 - * TEMP_MUL2 = row(i) * 2 - * - * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ -#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ - /* set
Constants */\ - VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ - /* row 1 */\ - VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[1] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[1] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ - /* --- */\ - /* row 2 */\ - VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[2] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[2] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ - /* --- */\ - /* row 3 */\ - VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[3] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[3] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ - /* --- */\ - /* row 5 */\ - VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[5] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[5] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ - /* --- */\ - /* row 6 */\ - VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[6] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[6] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ - /* --- */\ - /* row 7 */\ - VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[7] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ - VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ - /* --- */\ - /* row 4 */\ - VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[4] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ - /* --- */\ - /* row 0 */\ - VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ - TEMP_MUL2[0] = c2;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ - /* --- */\ -}/**/ - - -/* Optimized MixBytes - * inputs: - * a0-a7 = (row0-row7) * 4 - * b0 = row0 * 2 - * b3 = row7 * 2 - * b4 = row7 * 1 - * b7 = row4 * 1 - * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 - * output: b0-b7 - * */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* save one value */\ - TEMP_MUL4 = a3;\ - /* 1 */\ - b1 = a0;\ - b1 = _mm_xor_si128(b1, a5);\ - b1 = _mm_xor_si128(b1, b4); /* -> helper! */\ - b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\ - b2 = b1;\ - \ - /* 2 */\ - b5 = a1;\ - b5 = _mm_xor_si128(b5, a4);\ - b5 = _mm_xor_si128(b5, b7); /* -> helper! */\ - b5 = _mm_xor_si128(b5, b3); /* -> helper! */\ - b6 = b5;\ - \ - /* 4 */\ - b7 = _mm_xor_si128(b7, a6);\ - /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\ - b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\ - b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\ - b7 = _mm_xor_si128(b7, b3); /* -> helper! */\ - b2 = _mm_xor_si128(b2, b7);\ - \ - /* 3 */\ - b0 = _mm_xor_si128(b0, a7);\ - b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\ - b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\ - /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\ - b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\ - b3 = b0;\ - b1 = _mm_xor_si128(b1, b0);\ - b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\ - \ - /* 5 */\ - b4 = _mm_xor_si128(b4, a2);\ - /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! 
*/\ - b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\ - b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\ - b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\ - b3 = _mm_xor_si128(b3, b4);\ - b6 = _mm_xor_si128(b6, b4);\ - \ - /* 6 */\ - a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\ - b4 = _mm_xor_si128(b4, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - b7 = _mm_xor_si128(b7, a3);\ - \ - /* 7 */\ - a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\ - a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\ - b2 = _mm_xor_si128(b2, a1);\ - b3 = _mm_xor_si128(b3, a1);\ - \ - /* 8 */\ - a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\ - a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\ - b6 = _mm_xor_si128(b6, a5);\ - b7 = _mm_xor_si128(b7, a5);\ - \ - /* 9 */\ - a3 = TEMP_MUL1[2];\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\ - b0 = _mm_xor_si128(b0, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - \ - /* 10 */\ - a1 = TEMP_MUL1[6];\ - a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\ - b1 = _mm_xor_si128(b1, a1);\ - b4 = _mm_xor_si128(b4, a1);\ - \ - /* 11 */\ - a5 = TEMP_MUL1[3];\ - a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\ - b1 = _mm_xor_si128(b1, a5);\ - b6 = _mm_xor_si128(b6, a5);\ - \ - /* 12 */\ - a3 = TEMP_MUL1[7];\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\ - b2 = _mm_xor_si128(b2, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - \ - /* 13 */\ - b0 = _mm_xor_si128(b0, (TEMP_MUL4));\ - b0 = _mm_xor_si128(b0, a4);\ - b1 = _mm_xor_si128(b1, a4);\ - b3 = _mm_xor_si128(b3, a6);\ - b4 = _mm_xor_si128(b4, a0);\ - b4 = _mm_xor_si128(b4, a7);\ - b5 = _mm_xor_si128(b5, a0);\ - b7 = _mm_xor_si128(b7, a2);\ -}/**/ - -#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\ - SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\ - SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\ - SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\ - SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\ - SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ - }\ - ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ -}/**/ - -/* vperm: - * transformation before rounds with ipt - * first round add transformed constant - * middle rounds: add constant XOR 0x15...15 - * last round: additionally add 0x15...15 after MB - * transformation after rounds with opt - */ -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant + ShiftBytes (interleaved) */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a4 = _mm_xor_si128(a4, b1);\ - a2 = _mm_shuffle_epi8(a2, 
(SUBSH_MASK[2]));\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - /* SubBytes + Multiplication by 2 and 4 */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ - ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ -} - - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ -\ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ -\ - o1 = i0;\ - t0 = i2;\ -\ - i0 = _mm_unpacklo_epi16(i0, i1);\ - o1 = _mm_unpackhi_epi16(o1, i1);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - t0 = _mm_unpackhi_epi16(t0, i3);\ -\ - i0 = _mm_shuffle_epi32(i0, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - t0 = _mm_shuffle_epi32(t0, 216);\ -\ - o2 = i0;\ - o3 = o1;\ -\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - o1 = _mm_unpacklo_epi32(o1, t0);\ - o2 = _mm_unpackhi_epi32(o2, i2);\ - o3 = _mm_unpackhi_epi32(o3, t0);\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - o1 = i0;\ - o2 = i1;\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - o1 = _mm_unpackhi_epi64(o1, i4);\ - o3 = i1;\ - o4 = i2;\ - o2 = _mm_unpacklo_epi64(o2, i5);\ - o3 = 
_mm_unpackhi_epi64(o3, i5);\ - o5 = i2;\ - o6 = i3;\ - o4 = _mm_unpacklo_epi64(o4, i6);\ - o5 = _mm_unpackhi_epi64(o5, i6);\ - o7 = i3;\ - o6 = _mm_unpacklo_epi64(o6, i7);\ - o7 = _mm_unpackhi_epi64(o7, i7);\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - o0 = i0;\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o0 = _mm_unpackhi_epi64(o0, i1);\ - o1 = i2;\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - o1 = _mm_unpackhi_epi64(o1, i3);\ - o2 = i4;\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - o2 = _mm_unpackhi_epi64(o2, i5);\ - o3 = i6;\ - i6 = _mm_unpacklo_epi64(i6, i7);\ - o3 = _mm_unpackhi_epi64(o3, i7);\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - t0 = _mm_xor_si128(t0, t0);\ - i1 = i0;\ - i3 = i2;\ - i5 = i4;\ - i7 = i6;\ - i0 = _mm_unpacklo_epi64(i0, t0);\ - i1 = _mm_unpackhi_epi64(i1, t0);\ - i2 = _mm_unpacklo_epi64(i2, t0);\ - i3 = _mm_unpackhi_epi64(i3, t0);\ - i4 = _mm_unpacklo_epi64(i4, t0);\ - i5 = _mm_unpackhi_epi64(i5, t0);\ - i6 = _mm_unpacklo_epi64(i6, t0);\ - i7 = _mm_unpackhi_epi64(i7, t0);\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - xmm0 = ROUND_CONST_L0[i];\ - xmm1 = ROUND_CONST_L7[i];\ - xmm2 = ROUND_CONST_L0[j];\ - xmm3 = ROUND_CONST_L7[j];\ - VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ - xmm1 = _mm_xor_si128(xmm1, (ALL_15));\ - xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ - xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ - ROUND_CONST_L0[i] = xmm0;\ - ROUND_CONST_L7[i] = xmm1;\ - ROUND_CONST_L0[j] = xmm2;\ - ROUND_CONST_L7[j] = xmm3;\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - xmm0 = ROUND_CONST_Lx;\ - VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ - ROUND_CONST_Lx = xmm0;\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ -}/**/ - -void INIT(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15; - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm12 - xmm15 */ - xmm12 = chaining[0]; - xmm13 = chaining[1]; - xmm14 = chaining[2]; - xmm15 = chaining[3]; - - /* 
transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* store transposed IV */ - chaining[0] = xmm12; - chaining[1] = xmm2; - chaining[2] = xmm6; - chaining[3] = xmm7; -} - -void TF512(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm12 - xmm15 */ - xmm12 = message[0]; - xmm13 = message[1]; - xmm14 = message[2]; - xmm15 = message[3]; - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - xmm8 = chaining[0]; - xmm0 = chaining[1]; - xmm4 = chaining[2]; - xmm5 = chaining[3]; - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - xmm8 = _mm_xor_si128(xmm8, xmm12); - xmm0 = _mm_xor_si128(xmm0, xmm2); - xmm4 = _mm_xor_si128(xmm4, xmm6); - xmm5 = _mm_xor_si128(xmm5, xmm7); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm10); - xmm2 = _mm_xor_si128(xmm2, xmm12); - xmm3 = _mm_xor_si128(xmm3, xmm14); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, (chaining[0])); - xmm1 = _mm_xor_si128(xmm1, (chaining[1])); - xmm2 = _mm_xor_si128(xmm2, (chaining[2])); - xmm3 = _mm_xor_si128(xmm3, (chaining[3])); - - /* store CV */ - chaining[0] = xmm0; - chaining[1] = xmm1; - chaining[2] = xmm2; - chaining[3] = xmm3; - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF512(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - xmm8 = chaining[0]; - xmm10 = chaining[1]; - xmm12 = chaining[2]; - xmm14 = chaining[3]; - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm 
register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm10 = _mm_xor_si128(xmm10, (chaining[1])); - xmm12 = _mm_xor_si128(xmm12, (chaining[2])); - xmm14 = _mm_xor_si128(xmm14, (chaining[3])); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); - VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); - - /* we only need to return the truncated half of the state */ - chaining[2] = xmm9; - chaining[3] = xmm11; - - return; -}//OF512() - -#endif - -#if (LENGTH > 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ - SUBSH_MASK[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);\ - SUBSH_MASK[1] = _mm_set_epi32(0x000f0e0d, 0x0c0b0a09, 0x08070605, 0x04030201);\ - SUBSH_MASK[2] = _mm_set_epi32(0x01000f0e, 0x0d0c0b0a, 0x09080706, 0x05040302);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0201000f, 0x0e0d0c0b, 0x0a090807, 0x06050403);\ - SUBSH_MASK[4] = _mm_set_epi32(0x03020100, 0x0f0e0d0c, 0x0b0a0908, 0x07060504);\ - SUBSH_MASK[5] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605);\ - SUBSH_MASK[6] = _mm_set_epi32(0x05040302, 0x01000f0e, 0x0d0c0b0a, 0x09080706);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b);\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ - }\ -}/**/ - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* SubBytes + Multiplication */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -#define ROUNDS_P(){\ - u8 round_counter = 0;\ - for(round_counter = 0; round_counter < 14; round_counter+=2) {\ - /* AddRoundConstant P1024 */\ - xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ - xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ - /* SubBytes + MixBytes */\ - SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - VPERM_Add_Constant(xmm0, 
xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, ALL_15, xmm8);\ - \ - /* AddRoundConstant P1024 */\ - xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ - xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ - /* SubBytes + MixBytes */\ - SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ - }\ -}/**/ - -#define ROUNDS_Q(){\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm1);\ - u8 round_counter = 0;\ - for(round_counter = 0; round_counter < 14; round_counter+=2) {\ - /* AddRoundConstant Q1024 */\ - xmm1 = ALL_FF;\ - xmm8 = _mm_xor_si128(xmm8, xmm1);\ - xmm9 = _mm_xor_si128(xmm9, xmm1);\ - xmm10 = _mm_xor_si128(xmm10, xmm1);\ - xmm11 = _mm_xor_si128(xmm11, xmm1);\ - xmm12 = _mm_xor_si128(xmm12, xmm1);\ - xmm13 = _mm_xor_si128(xmm13, xmm1);\ - xmm14 = _mm_xor_si128(xmm14, xmm1);\ - xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ - xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ - /* SubBytes + MixBytes */\ - SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - \ - /* AddRoundConstant Q1024 */\ - xmm9 = ALL_FF;\ - xmm0 = _mm_xor_si128(xmm0, xmm9);\ - xmm1 = _mm_xor_si128(xmm1, xmm9);\ - xmm2 = _mm_xor_si128(xmm2, xmm9);\ - xmm3 = _mm_xor_si128(xmm3, xmm9);\ - xmm4 = _mm_xor_si128(xmm4, xmm9);\ - xmm5 = _mm_xor_si128(xmm5, xmm9);\ - xmm6 = _mm_xor_si128(xmm6, xmm9);\ - xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ - xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ - /* SubBytes + MixBytes*/ \ - SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - }\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm1);\ -}/**/ - - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - t0 = TRANSP_MASK;\ -\ - i6 = _mm_shuffle_epi8(i6, t0);\ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = 
_mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ - t1 = i2;\ - i4 = _mm_shuffle_epi8(i4, t0);\ - i5 = _mm_shuffle_epi8(i5, t0);\ - t2 = i4;\ - t3 = i6;\ - i7 = _mm_shuffle_epi8(i7, t0);\ -\ - /* continue with unpack using 4 temp registers */\ - t0 = i0;\ - t2 = _mm_unpackhi_epi16(t2, i5);\ - i4 = _mm_unpacklo_epi16(i4, i5);\ - t3 = _mm_unpackhi_epi16(t3, i7);\ - i6 = _mm_unpacklo_epi16(i6, i7);\ - t0 = _mm_unpackhi_epi16(t0, i1);\ - t1 = _mm_unpackhi_epi16(t1, i3);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - i0 = _mm_unpacklo_epi16(i0, i1);\ -\ - /* shuffle with immediate */\ - t0 = _mm_shuffle_epi32(t0, 216);\ - t1 = _mm_shuffle_epi32(t1, 216);\ - t2 = _mm_shuffle_epi32(t2, 216);\ - t3 = _mm_shuffle_epi32(t3, 216);\ - i0 = _mm_shuffle_epi32(i0, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - i4 = _mm_shuffle_epi32(i4, 216);\ - i6 = _mm_shuffle_epi32(i6, 216);\ -\ - /* continue with unpack */\ - t4 = i0;\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - t4 = _mm_unpackhi_epi32(t4, i2);\ - t5 = t0;\ - t0 = _mm_unpacklo_epi32(t0, t1);\ - t5 = _mm_unpackhi_epi32(t5, t1);\ - t6 = i4;\ - i4 = _mm_unpacklo_epi32(i4, i6);\ - t7 = t2;\ - t6 = _mm_unpackhi_epi32(t6, i6);\ - i2 = t0;\ - t2 = _mm_unpacklo_epi32(t2, t3);\ - i3 = t0;\ - t7 = _mm_unpackhi_epi32(t7, t3);\ -\ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - i1 = i0;\ - i1 = _mm_unpackhi_epi64(i1, i4);\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - i4 = t4;\ - i3 = _mm_unpackhi_epi64(i3, t2);\ - i5 = t4;\ - i2 = _mm_unpacklo_epi64(i2, t2);\ - i6 = t5;\ - i5 = _mm_unpackhi_epi64(i5, t6);\ - i7 = t5;\ - i4 = _mm_unpacklo_epi64(i4, t6);\ - i7 = _mm_unpackhi_epi64(i7, t7);\ - i6 = _mm_unpacklo_epi64(i6, t7);\ - /* transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - /* transpose matrix to get output format */\ - o1 = i0;\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o1 = _mm_unpackhi_epi64(o1, i1);\ - t0 = i2;\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - t0 = _mm_unpackhi_epi64(t0, i3);\ - t1 = i4;\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - t1 = _mm_unpackhi_epi64(t1, i5);\ - t2 = i6;\ - o0 = TRANSP_MASK;\ - i6 = _mm_unpacklo_epi64(i6, i7);\ - t2 = _mm_unpackhi_epi64(t2, i7);\ - /* load transpose mask into a register, because it will be used 8 times */\ - i0 = _mm_shuffle_epi8(i0, o0);\ - i2 = _mm_shuffle_epi8(i2, o0);\ - i4 = _mm_shuffle_epi8(i4, o0);\ - i6 = _mm_shuffle_epi8(i6, o0);\ - o1 = _mm_shuffle_epi8(o1, o0);\ - t0 = _mm_shuffle_epi8(t0, o0);\ - t1 = _mm_shuffle_epi8(t1, o0);\ - t2 = _mm_shuffle_epi8(t2, o0);\ - /* continue with unpack using 4 temp registers */\ - t3 = i4;\ - o2 = o1;\ - o0 = i0;\ - t4 = t1;\ - \ - t3 = _mm_unpackhi_epi16(t3, i6);\ - i4 = _mm_unpacklo_epi16(i4, i6);\ - o0 = _mm_unpackhi_epi16(o0, i2);\ - i0 = _mm_unpacklo_epi16(i0, i2);\ - o2 = _mm_unpackhi_epi16(o2, t0);\ - o1 = _mm_unpacklo_epi16(o1, t0);\ - t4 = _mm_unpackhi_epi16(t4, t2);\ - t1 = _mm_unpacklo_epi16(t1, t2);\ - /* shuffle with immediate */\ - i4 = _mm_shuffle_epi32(i4, 216);\ - t3 = _mm_shuffle_epi32(t3, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - o2 = _mm_shuffle_epi32(o2, 216);\ - i0 = _mm_shuffle_epi32(i0, 216);\ - o0 = _mm_shuffle_epi32(o0, 216);\ - t1 = _mm_shuffle_epi32(t1, 216);\ - t4 = _mm_shuffle_epi32(t4, 216);\ - /* continue 
with unpack */\ - i1 = i0;\ - i3 = o0;\ - i5 = o1;\ - i7 = o2;\ - i0 = _mm_unpacklo_epi32(i0, i4);\ - i1 = _mm_unpackhi_epi32(i1, i4);\ - o0 = _mm_unpacklo_epi32(o0, t3);\ - i3 = _mm_unpackhi_epi32(i3, t3);\ - o1 = _mm_unpacklo_epi32(o1, t1);\ - i5 = _mm_unpackhi_epi32(i5, t1);\ - o2 = _mm_unpacklo_epi32(o2, t4);\ - i7 = _mm_unpackhi_epi32(i7, t4);\ - /* transpose done */\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - xmm0 = ROUND_CONST_P[i];\ - xmm1 = ROUND_CONST_P[j];\ - xmm2 = ROUND_CONST_Q[i];\ - xmm3 = ROUND_CONST_Q[j];\ - VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ - xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ - ROUND_CONST_P[i] = xmm0;\ - ROUND_CONST_P[j] = xmm1;\ - ROUND_CONST_Q[i] = xmm2;\ - ROUND_CONST_Q[j] = xmm3;\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ - VPERM_Transform_RoundConst_CNT2(10, 11);\ - VPERM_Transform_RoundConst_CNT2(12, 13);\ - xmm0 = ALL_FF;\ - VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ - ALL_FF = xmm0;\ -}/**/ - - -void INIT(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm8 - xmm15 */ - xmm8 = chaining[0]; - xmm9 = chaining[1]; - xmm10 = chaining[2]; - xmm11 = chaining[3]; - xmm12 = chaining[4]; - xmm13 = chaining[5]; - xmm14 = chaining[6]; - xmm15 = chaining[7]; - - /* transform chaining value from column ordering into row ordering */ - VPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - - /* store transposed IV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; -} - -void TF1024(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - static __m128i QTEMP[8]; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm8 - xmm15 (Q = message) */ - xmm8 = message[0]; - xmm9 = message[1]; - xmm10 = message[2]; - xmm11 = message[3]; - xmm12 = message[4]; - xmm13 = message[5]; - xmm14 = message[6]; - xmm15 = message[7]; - - /* transform message M from column ordering into row ordering */ - VPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose(xmm8, 
xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - - /* store message M (Q input) for later */ - QTEMP[0] = xmm8; - QTEMP[1] = xmm9; - QTEMP[2] = xmm10; - QTEMP[3] = xmm11; - QTEMP[4] = xmm12; - QTEMP[5] = xmm13; - QTEMP[6] = xmm14; - QTEMP[7] = xmm15; - - /* xor CV to message to get P input */ - /* result: CV+M in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, (chaining[7])); - - /* compute permutation P */ - /* result: P(CV+M) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV+M)+CV in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, (chaining[7])); - - /* store P(CV+M)+CV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; - - /* load message M (Q input) into xmm8-15 */ - xmm8 = QTEMP[0]; - xmm9 = QTEMP[1]; - xmm10 = QTEMP[2]; - xmm11 = QTEMP[3]; - xmm12 = QTEMP[4]; - xmm13 = QTEMP[5]; - xmm14 = QTEMP[6]; - xmm15 = QTEMP[7]; - - /* compute permutation Q */ - /* result: Q(M) in xmm8...xmm15 */ - ROUNDS_Q(); - - /* xor Q output */ - /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, (chaining[7])); - - /* store CV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF1024(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - - /* load CV into registers xmm8 - xmm15 */ - xmm8 = chaining[0]; - xmm9 = chaining[1]; - xmm10 = chaining[2]; - xmm11 = chaining[3]; - xmm12 = chaining[4]; - xmm13 = chaining[5]; - xmm14 = chaining[6]; - xmm15 = chaining[7]; - - /* compute permutation P */ - /* result: P(CV) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, 
(chaining[7]));
-
- /* transpose CV back from row ordering to column ordering */
- /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
- Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
- VPERM_Transform_State(xmm0, xmm6, xmm13, xmm15, VPERM_OPT, xmm1, xmm2, xmm3, xmm5, xmm7, xmm10, xmm12);
-
- /* we only need to return the truncated half of the state */
- chaining[4] = xmm0;
- chaining[5] = xmm6;
- chaining[6] = xmm13;
- chaining[7] = xmm15;
-
- return;
-}
-
-#endif
-
diff --git a/algo/groestl/aes_ni/groestl-version.h b/algo/groestl/aes_ni/groestl-version.h
deleted file mode 100644
index 26736ec..0000000
--- a/algo/groestl/aes_ni/groestl-version.h
+++ /dev/null
@@ -1,10 +0,0 @@
-// specify assembly or intrinsics implementation
-//#define TASM
-#define TINTR
-
-// Not to be confused with AVX512VAES
-#define VAES
-// #define VAVX
-// #define VVPERM
-
-//#endif
diff --git a/algo/groestl/aes_ni/groestl256-asm-aes.h b/algo/groestl/aes_ni/groestl256-asm-aes.h
deleted file mode 100644
index 0810b5e..0000000
--- a/algo/groestl/aes_ni/groestl256-asm-aes.h
+++ /dev/null
@@ -1,529 +0,0 @@
-/* groestl-asm-aes.h Aug 2011
- *
- * Groestl implementation with inline assembly using ssse3, sse4.1, and aes
- * instructions.
- * Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * This code is placed in the public domain
- */
-
-#include "hash-groestl256.h"
-/* global constants */
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
-__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
-__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
-__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
-
-/* temporary variables */
-__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
-__attribute__ ((aligned (16))) unsigned char TEMP[3*16];
-
-
-#define tos(a) #a
-#define tostr(a) tos(a)
-
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b */
-#define MUL2(i, j, k){\
- asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\
- asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\
- asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\
- asm("pand xmm"tostr(j)", xmm"tostr(k)"");\
- asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\
-}/**/
-
-/* Yet another implementation of MixBytes.
- This time we use the formulae (3) from the paper "Byte Slicing Groestl".
- Input: a0, ..., a7
- Output: b0, ..., b7 = MixBytes(a0,...,a7).
- but we use the relations:
- t_i = a_i + a_{i+1}
- x_i = t_i + t_{i+3}
- y_i = t_i + t_{i+2} + a_{i+6}
- z_i = 2*x_i
- w_i = z_i + y_{i+4}
- v_i = 2*w_i
- b_i = v_{i+3} + y_{i+4}
- We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
- and then adding v_i computed in the meantime in registers xmm0..xmm7.
- We almost fit into 16 registers, need only 3 spills to memory.
- This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
- K.
Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* t_i = a_i + a_{i+1} */\ - asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ - asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\ - asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\ - asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\ - asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\ - asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\ - asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\ - asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\ - /* spill values y_4, y_5 to memory */\ - asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\ - asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ - asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ - asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\ - \ - /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - /* compute w_i : add y_{i+4} */\ - asm("movaps xmm"tostr(b1)", [ALL_1B]");\ - MUL2(a0, b0, b1);\ - asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\ - MUL2(a1, b0, b1);\ - asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\ - MUL2(a2, b0, b1);\ - asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\ - MUL2(a3, b0, b1);\ - asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\ - MUL2(a4, b0, b1);\ - asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\ - MUL2(a5, b0, b1);\ - asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\ - MUL2(a6, b0, b1);\ - asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\ - MUL2(a7, b0, b1);\ - asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\ - \ - /* compute v_i : double w_i */\ - /* add to y_4 y_5 .. v3, v4, ... 
*/\ - MUL2(a0, b0, b1);\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ - MUL2(a1, b0, b1);\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\ - MUL2(a2, b0, b1);\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ - MUL2(a5, b0, b1);\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\ - MUL2(a6, b0, b1);\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ - MUL2(a7, b0, b1);\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ - MUL2(a3, b0, b1);\ - MUL2(a4, b0, b1);\ - asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\ - asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ -}/*MixBytes*/ - -#define SET_CONSTANTS(){\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}while(0); - -#define Push_All_Regs() do{\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}while(0); - -#define Pop_All_Regs() do{\ -/* not using any... 
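/* [Editor's note] The MUL2 macro used throughout MixBytes above is the
 * GF(2^8) doubling ("xtime") step: pcmpgtb against a zeroed register yields
 * 0xFF in every byte whose top bit is set, paddb doubles each byte, and the
 * masked 0x1b is XORed back in to reduce modulo the AES/Groestl polynomial
 * x^8 + x^4 + x^3 + x + 1. A minimal sketch of the same operation, first per
 * byte and then as a direct SSE2 mirror of the macro; the helper names are
 * illustrative, not part of this source. */
#include <stdint.h>
#include <emmintrin.h>

static inline uint8_t gf256_double(uint8_t a)
{
    uint8_t mask = (uint8_t)-(a >> 7);          /* 0xFF if msb set, else 0 */
    return (uint8_t)((a << 1) ^ (mask & 0x1b)); /* shift, then reduce      */
}

static inline __m128i mul2_sse2(__m128i i, __m128i all_1b)
{
    __m128i j = _mm_cmpgt_epi8(_mm_setzero_si128(), i); /* pcmpgtb: msb mask  */
    i = _mm_add_epi8(i, i);                             /* paddb: bytewise *2 */
    return _mm_xor_si128(i, _mm_and_si128(j, all_1b));  /* pand + pxor        */
}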
- asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}while(0); - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant */\ - asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - /* ShiftBytes + SubBytes (interleaved) */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ - \ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - \ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - \ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ - \ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd 
xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - \ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ - \ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ - asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i7)", 
xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -void INIT256(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm12 - xmm15 */ - asm ("movaps xmm12, [rdi+0*16]"); - asm ("movaps xmm13, [rdi+1*16]"); - asm ("movaps xmm14, [rdi+2*16]"); - asm ("movaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm12"); - asm ("movaps [rdi+1*16], xmm2"); - asm ("movaps [rdi+2*16], xmm6"); - asm ("movaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("movaps xmm12, [rsi+0*16]"); - asm ("movaps xmm13, [rsi+1*16]"); - asm ("movaps xmm14, [rsi+2*16]"); - asm ("movaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm0, [rdi+1*16]"); - asm ("movaps xmm4, [rdi+2*16]"); - asm ("movaps xmm5, [rdi+3*16]"); - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("pxor xmm8, xmm12"); - asm ("pxor xmm0, xmm2"); - asm ("pxor xmm4, xmm6"); - asm ("pxor xmm5, xmm7"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("pxor xmm0, xmm8"); - asm ("pxor xmm1, xmm10"); - asm ("pxor xmm2, xmm12"); - asm ("pxor xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("pxor xmm0, [rdi+0*16]"); - asm ("pxor xmm1, [rdi+1*16]"); - asm ("pxor xmm2, [rdi+2*16]"); - asm ("pxor xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm0"); - asm ("movaps [rdi+1*16], xmm1"); - asm ("movaps [rdi+2*16], xmm2"); - asm ("movaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax 
noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm10, [rdi+1*16]"); - asm ("movaps xmm12, [rdi+2*16]"); - asm ("movaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm10, [rdi+1*16]"); - asm ("pxor xmm12, [rdi+2*16]"); - asm ("pxor xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+2*16], xmm9"); - asm ("movaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - diff --git a/algo/groestl/aes_ni/groestl256-asm-avx.h b/algo/groestl/aes_ni/groestl256-asm-avx.h deleted file mode 100644 index e7cb4c7..0000000 --- a/algo/groestl/aes_ni/groestl256-asm-avx.h +++ /dev/null @@ -1,519 +0,0 @@ -/* groestl-asm-avx.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx - * instructions. - * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz - * - * This code is placed in the public domain - */ - -#include "hash-groestl256.h" - -/* global variables */ -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; -__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (32))) unsigned char ALL_1B[32]; -__attribute__ ((aligned (32))) unsigned char ALL_FF[32]; - -/* temporary variables */ -__attribute__ ((aligned (32))) unsigned char TEMP[6*32]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}while(0); - -#define Push_All_Regs() do{\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}while(0); - -#define Pop_All_Regs() do{\ -/* not using any... - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}while(0); - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2(i, j, k, z){\ - asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2v2(i, j, k, z){\ - asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* Yet another implementation of MixBytes. - This time we use the formulae (3) from the paper "Byte Slicing Groestl". 
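/* [Editor's note] The relations this comment goes on to list compute the
 * MixBytes circulant (02,02,03,04,05,03,05,07) from shared subterms and two
 * doubling passes. A scalar model, one byte per row, with '+' read as XOR,
 * '2*' as GF(2^8) doubling, t_i = a_i + a_{i+1} (which is what the
 * instruction sequence below actually computes), and all indices taken
 * mod 8, an assumption the comment leaves implicit; gf256_double and
 * mixbytes_model are illustrative names, not part of this source. */
#include <stdint.h>

static inline uint8_t gf256_double(uint8_t a)
{
    return (uint8_t)((a << 1) ^ ((uint8_t)-(a >> 7) & 0x1b));
}

static void mixbytes_model(const uint8_t a[8], uint8_t b[8])
{
    uint8_t t[8], x[8], y[8], w[8], v[8];
    for (int i = 0; i < 8; i++) t[i] = a[i] ^ a[(i + 1) & 7];
    for (int i = 0; i < 8; i++) {
        x[i] = t[i] ^ t[(i + 3) & 7];
        y[i] = t[i] ^ t[(i + 2) & 7] ^ a[(i + 6) & 7];
    }
    for (int i = 0; i < 8; i++)
        w[i] = gf256_double(x[i]) ^ y[(i + 4) & 7];   /* z_i, then w_i */
    for (int i = 0; i < 8; i++) v[i] = gf256_double(w[i]);
    for (int i = 0; i < 8; i++) b[i] = v[(i + 3) & 7] ^ y[(i + 4) & 7];
}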
- Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). - but we use the relations: - t_i = a_i + a_{i+3} - x_i = t_i + t_{i+3} - y_i = t_i + t+{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ - asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ - asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ - asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ - asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ - asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ - asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ - asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ - asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ - \ - /* t_i = a_i + a_{i+1} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\ - \ - asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\ - \ - /* spill values y_4, y_5 to memory */\ - asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\ - asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\ - asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\ - \ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ - asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", 
xmm"tostr(b0)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\ - \ - /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\ - VMUL2(a7, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a0, b0, b1, b2);\ - \ - /* compute w_i : add y_{i+4} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\ - asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a7, b0, b1, b2);\ - \ - /* add to y_4 y_5 .. v3, v4, ... */\ - asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\ -}/*MixBytes*/ - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant */\ - asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - /* ShiftBytes + SubBytes (interleaved) */\ - asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", 
[SUBSH_MASK+5*16]");\ - asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ -\ - asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ -\ - asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ -\ - asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\ - asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\ - asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\ - asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\ 
- asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -void INIT256(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm12 - xmm15 */ - asm ("vmovaps xmm12, [rdi+0*16]"); - asm ("vmovaps xmm13, [rdi+1*16]"); - asm ("vmovaps xmm14, [rdi+2*16]"); - asm ("vmovaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("vmovaps [rdi+0*16], xmm12"); - asm ("vmovaps [rdi+1*16], xmm2"); - asm ("vmovaps [rdi+2*16], xmm6"); - asm ("vmovaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message 
into registers xmm12 - xmm15 (Q = message) */ - asm ("vmovaps xmm12, [rsi+0*16]"); - asm ("vmovaps xmm13, [rsi+1*16]"); - asm ("vmovaps xmm14, [rsi+2*16]"); - asm ("vmovaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("vpxor xmm8, xmm12, [rdi+0*16]"); - asm ("vpxor xmm0, xmm2, [rdi+1*16]"); - asm ("vpxor xmm4, xmm6, [rdi+2*16]"); - asm ("vpxor xmm5, xmm7, [rdi+3*16]"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("vpxor xmm0, xmm0, xmm8"); - asm ("vpxor xmm1, xmm1, xmm10"); - asm ("vpxor xmm2, xmm2, xmm12"); - asm ("vpxor xmm3, xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("vpxor xmm0, xmm0, [rdi+0*16]"); - asm ("vpxor xmm1, xmm1, [rdi+1*16]"); - asm ("vpxor xmm2, xmm2, [rdi+2*16]"); - asm ("vpxor xmm3, xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("vmovaps [rdi+0*16], xmm0"); - asm ("vmovaps [rdi+1*16], xmm1"); - asm ("vmovaps [rdi+2*16], xmm2"); - asm ("vmovaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("vmovaps xmm8, [rdi+0*16]"); - asm ("vmovaps xmm10, [rdi+1*16]"); - asm ("vmovaps xmm12, [rdi+2*16]"); - asm ("vmovaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("vpxor xmm8, xmm8, [rdi+0*16]"); - asm ("vpxor xmm10, xmm10, [rdi+1*16]"); - asm ("vpxor xmm12, xmm12, [rdi+2*16]"); - asm ("vpxor xmm14, xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - - /* we only need to return the truncated half of the state */ - asm ("vmovaps [rdi+2*16], xmm9"); - asm ("vmovaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - diff --git 
a/algo/groestl/aes_ni/groestl256-asm-vperm.h b/algo/groestl/aes_ni/groestl256-asm-vperm.h deleted file mode 100644 index a25ade7..0000000 --- a/algo/groestl/aes_ni/groestl256-asm-vperm.h +++ /dev/null @@ -1,856 +0,0 @@ -/* groestl-asm-vperm.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3 instructions. - * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz - * - * Based on the vperm and aes_ni implementations of the hash function Groestl - * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ - * Institute of Applied Mathematics, Middle East Technical University, Turkey - * - * This code is placed in the public domain - */ - -#include "hash-groestl256.h" - -/* global constants */ -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (16))) unsigned char ALL_0F[16]; -__attribute__ ((aligned (16))) unsigned char ALL_15[16]; -__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; -__attribute__ ((aligned (16))) unsigned char ALL_63[16]; -__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; -__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16]; - -/* temporary variables */ -__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16]; -__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP[8*16]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_SHARED_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\ - ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\ - ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\ - ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\ - ((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\ - ((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\ - ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\ - ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\ - ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\ - ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\ - ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\ - ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\ - ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\ - ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\ - ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\ - ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\ - ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\ - ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\ - ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\ - ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\ 
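/* [Editor's note] The VPERM_* tables initialized here drive the vperm trick:
 * each byte is split into its two nibbles, each nibble selects one of 16
 * table bytes via pshufb, and the two lookups are XORed, giving a
 * constant-time affine transform with no AES instructions. A sketch with the
 * two 16-byte table halves passed in (tab0 = table+0*16 indexed by the high
 * nibble, tab1 = table+1*16 by the low nibble, matching the pshufb pairing
 * in VPERM_Transform_No_Const below); names are illustrative. */
#include <tmmintrin.h>

static inline __m128i vperm_transform(__m128i x, __m128i tab0, __m128i tab1)
{
    const __m128i mask0f = _mm_set1_epi8(0x0F);
    __m128i lo = _mm_and_si128(x, mask0f);                    /* low nibbles  */
    __m128i hi = _mm_and_si128(_mm_srli_epi16(x, 4), mask0f); /* high nibbles */
    return _mm_xor_si128(_mm_shuffle_epi8(tab0, hi),          /* tab0[hi]     */
                         _mm_shuffle_epi8(tab1, lo));         /* ^ tab1[lo]   */
}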
- ((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\ - ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\ - ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\ - ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\ - ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\ - ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\ - ((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\ - ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\ -/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\ - ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\ - ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\ - ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\ - ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\ - ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\ -}/**/ - -/* VPERM - * Transform w/o settings c* - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\ - asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ - asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\ - asm ("psrld xmm"tostr(t0)", 4");\ - asm ("psrld xmm"tostr(t1)", 4");\ - asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ - asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\ - asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\ - asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\ - asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\ - asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\ -}/**/ - -#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ - asm ("movaps xmm"tostr(c0)", [ALL_0F]");\ - asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\ - asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\ -}/**/ - -/* VPERM - * Transform - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Transform State - * inputs: - * a0-a3 = state - * table = transformation table to use - * t* = clobbers - * outputs: - * a0-a3 = transformed state - * */ -#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ - VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Add Constant to State - * inputs: - * a0-a7 = state - * constant = constant to add - * t0 = clobber - * outputs: - * a0-a7 = state + constant - * */ -#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ - asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm 
("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * Set Substitute Core Constants - * */ -#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ - VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ -}/**/ - -/* VPERM - * Substitute Core - * first part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0 = 1 row - * t*, c* = clobbers - * outputs: - * b0a, b0b = inputs for lookup step - * */ -#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ - asm ("psrld xmm"tostr(t0)", 4");\ - asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\ - asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\ - asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\ - asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\ - asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\ - asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * Lookup - * second part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0a, a0b = output of Substitution Core - * table = lookup table to use (*1 / *2 / *4) - * t0 = clobber - * outputs: - * b0 = output of sbox + multiplication - * */ -#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ - asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\ - asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\ - asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * SubBytes and *2 / *4 - * this function is derived from: - * Constant-time SSSE3 AES core implementation - * by Mike Hamburg - * and - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0-a7 = state - * t*, c* = clobbers - * outputs: - * a0-a7 = state * 4 - * c2 = row0 * 2 -> b0 - * c1 = row7 * 2 -> b3 - * c0 = row7 * 1 -> b4 - * t2 = row4 * 1 -> b7 - * TEMP_MUL1 = row(i) * 1 - * TEMP_MUL2 = row(i) * 2 - * - * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ -#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ - /* set Constants */\ - VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ - /* row 1 */\ - VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ - /* --- */\ - 
/* row 2 */\ - VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ - /* --- */\ - /* row 3 */\ - VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ - /* --- */\ - /* row 5 */\ - VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ - /* --- */\ - /* row 6 */\ - VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ - /* --- */\ - /* row 7 */\ - VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ - VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ - /* --- */\ - /* row 4 */\ - VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ - /* --- */\ - /* row 0 */\ - VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ - asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ - /* --- */\ -}/**/ - - -/* Optimized MixBytes - * inputs: - * a0-a7 = (row0-row7) * 4 - * b0 = row0 * 2 - * b3 = row7 * 2 - * b4 = row7 * 1 - * b7 = row4 * 1 - * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 - * output: b0-b7 - * */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* save one value */\ - asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\ - /* 1 */\ - asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\ - asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\ - \ - /* 2 */\ - asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\ - asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\ - \ - /* 4 */\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\ - /*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\ - asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! 
*/\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\ - \ - /* 3 */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\ - /*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\ - asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\ - \ - /* 5 */\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ - /*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\ - \ - /* 6 */\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ - \ - /* 7 */\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ - \ - /* 8 */\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ - \ - /* 9 */\ - asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* 10 */\ - asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\ - \ - /* 11 */\ - asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ - \ - /* 12 */\ - asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* 13 */\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ -}/**/ - -//#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\ - 
((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}/**/ - -#define Push_All_Regs(){\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}/**/ - -#define Pop_All_Regs(){\ -/* not using any... - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}/**/ - - -/* vperm: - * transformation before rounds with ipt - * first round add transformed constant - * middle rounds: add constant XOR 0x15...15 - * last round: additionally add 0x15...15 after MB - * transformation after rounds with opt - */ -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant + ShiftBytes (interleaved) */\ - asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - /* SubBytes + Multiplication by 2 and 4 */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ -} - - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: 
i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ -\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ -\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ -\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ -\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ -\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ - asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 
512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\ - asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\ - asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\ - VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("pxor xmm1, [ALL_15]");\ - asm ("pxor xmm2, [ALL_15]");\ - asm ("pxor xmm3, [ALL_15]");\ - asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\ - asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\ - asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\ - asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - asm ("movaps xmm0, [ROUND_CONST_Lx]");\ - VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("movaps [ROUND_CONST_Lx], xmm0");\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ -}/**/ - -void INIT256(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm12 - xmm15 */ - asm ("movaps xmm12, [rdi+0*16]"); - asm ("movaps xmm13, [rdi+1*16]"); - asm ("movaps xmm14, [rdi+2*16]"); - asm ("movaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm12"); - asm ("movaps [rdi+1*16], xmm2"); - asm ("movaps [rdi+2*16], xmm6"); - asm ("movaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm 
(".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("movaps xmm12, [rsi+0*16]"); - asm ("movaps xmm13, [rsi+1*16]"); - asm ("movaps xmm14, [rsi+2*16]"); - asm ("movaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm0, [rdi+1*16]"); - asm ("movaps xmm4, [rdi+2*16]"); - asm ("movaps xmm5, [rdi+3*16]"); - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("pxor xmm8, xmm12"); - asm ("pxor xmm0, xmm2"); - asm ("pxor xmm4, xmm6"); - asm ("pxor xmm5, xmm7"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("pxor xmm0, xmm8"); - asm ("pxor xmm1, xmm10"); - asm ("pxor xmm2, xmm12"); - asm ("pxor xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("pxor xmm0, [rdi+0*16]"); - asm ("pxor xmm1, [rdi+1*16]"); - asm ("pxor xmm2, [rdi+2*16]"); - asm ("pxor xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm0"); - asm ("movaps [rdi+1*16], xmm1"); - asm ("movaps [rdi+2*16], xmm2"); - asm ("movaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm10, [rdi+1*16]"); - asm ("movaps xmm12, [rdi+2*16]"); - asm ("movaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm10, [rdi+1*16]"); - asm ("pxor xmm12, [rdi+2*16]"); - asm ("pxor xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ 
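An aside for orientation, not part of the patch: every MixBytes variant in these files, asm or intrinsics, multiplies the eight state rows by the circulant matrix circ(2,2,3,4,5,3,5,7) over GF(2^8). The scalar sketch below is illustrative only; it models one byte column using the t/x/y/z/w/v relations quoted in the MixBytes comments of the intrinsics files further down, and the helper names gf_mul2 and mix_bytes_column are hypothetical.

#include <stdint.h>

/* GF(2^8) doubling (xtime): shift left, reduce with 0x1b when the top
 * bit was set. Scalar equivalent of the MUL2/VMUL2 macros. */
static uint8_t gf_mul2( uint8_t x )
{
   return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
}

/* One column of MixBytes via the byte-slicing relations (XOR is
 * addition in GF(2^8)):
 *   t_i = a_i + a_{i+1}
 *   x_i = t_i + t_{i+3}
 *   y_i = t_i + t_{i+2} + a_{i+6}
 *   w_i = 2*x_i + y_{i+4}
 *   b_i = 2*w_{i+3} + y_{i+4}
 */
static void mix_bytes_column( const uint8_t a[8], uint8_t b[8] )
{
   uint8_t t[8], x[8], y[8], w[8];
   for ( int i = 0; i < 8; i++ ) t[i] = a[i] ^ a[ (i+1) & 7 ];
   for ( int i = 0; i < 8; i++ ) x[i] = t[i] ^ t[ (i+3) & 7 ];
   for ( int i = 0; i < 8; i++ ) y[i] = t[i] ^ t[ (i+2) & 7 ] ^ a[ (i+6) & 7 ];
   for ( int i = 0; i < 8; i++ ) w[i] = gf_mul2( x[i] ) ^ y[ (i+4) & 7 ];
   for ( int i = 0; i < 8; i++ ) b[i] = gf_mul2( w[ (i+3) & 7 ] ) ^ y[ (i+4) & 7 ];
}

Only doubling and XOR appear because the relations decompose the 3x, 4x, 5x and 7x row multiples into 2x and 2*(2x) terms; the SIMD macros compute exactly this, one whole row per register.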
- /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+2*16], xmm9"); - asm ("movaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - - diff --git a/algo/groestl/aes_ni/groestl256-intr-aes.h b/algo/groestl/aes_ni/groestl256-intr-aes.h index 57dd930..15517cf 100644 --- a/algo/groestl/aes_ni/groestl256-intr-aes.h +++ b/algo/groestl/aes_ni/groestl256-intr-aes.h @@ -11,18 +11,6 @@ #include #include "hash-groestl256.h" -/* global constants */ -__m128i ROUND_CONST_Lx; -__m128i ROUND_CONST_L0[ROUNDS512]; -__m128i ROUND_CONST_L7[ROUNDS512]; -//__m128i ROUND_CONST_P[ROUNDS1024]; -//__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_1B; -__m128i ALL_FF; - - #define tos(a) #a #define tostr(a) tos(a) @@ -113,7 +101,7 @@ __m128i ALL_FF; \ /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ /* compute w_i : add y_{i+4} */\ - b1 = ALL_1B;\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ MUL2(a0, b0, b1);\ a0 = _mm_xor_si128(a0, TEMP0);\ MUL2(a1, b0, b1);\ @@ -153,24 +141,35 @@ __m128i ALL_FF; b1 = _mm_xor_si128(b1, a4);\ }/*MixBytes*/ -#define SET_CONSTANTS(){\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ - SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ - SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ - SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ - SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ - }\ - ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ -}while(0); \ + +static const uint64_t round_const_l0[] __attribute__ ((aligned (64))) = +{ + 0x7060504030201000, 0xffffffffffffffff, + 0x7161514131211101, 0xffffffffffffffff, + 0x7262524232221202, 0xffffffffffffffff, + 0x7363534333231303, 0xffffffffffffffff, + 0x7464544434241404, 0xffffffffffffffff, + 0x7565554535251505, 0xffffffffffffffff, + 0x7666564636261606, 0xffffffffffffffff, + 0x7767574737271707, 0xffffffffffffffff, + 0x7868584838281808, 0xffffffffffffffff, + 0x7969594939291909, 0xffffffffffffffff +}; + +static const uint64_t round_const_l7[] __attribute__ ((aligned (64))) = +{ +0x0000000000000000, 0x8f9fafbfcfdfefff, +0x0000000000000000, 0x8e9eaebecedeeefe, +0x0000000000000000, 0x8d9dadbdcdddedfd, +0x0000000000000000, 0x8c9cacbcccdcecfc, +0x0000000000000000, 0x8b9babbbcbdbebfb, +0x0000000000000000, 0x8a9aaabacadaeafa, +0x0000000000000000, 0x8999a9b9c9d9e9f9, +0x0000000000000000, 0x8898a8b8c8d8e8f8, +0x0000000000000000, 0x8797a7b7c7d7e7f7, +0x0000000000000000, 0x8696a6b6c6d6e6f6 +}; + /* one round * i = 
round number @@ -179,34 +178,42 @@ __m128i ALL_FF; */ #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* AddRoundConstant */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a4 = _mm_xor_si128(a4, b1);\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + b1 = m128_const_64( 0xffffffffffffffff, 0 ); \ + a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \ + a1 = _mm_xor_si128( a1, b1 ); \ + a2 = _mm_xor_si128( a2, b1 ); \ + a3 = _mm_xor_si128( a3, b1 ); \ + a4 = _mm_xor_si128( a4, b1 ); \ + a5 = _mm_xor_si128( a5, b1 ); \ + a6 = _mm_xor_si128( a6, b1 ); \ + a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \ \ /* ShiftBytes + SubBytes (interleaved) */\ b0 = _mm_xor_si128(b0, b0);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a0 = _mm_aesenclast_si128(a0, b0);\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a1 = _mm_aesenclast_si128(a1, b0);\ - a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ - a2 = _mm_aesenclast_si128(a2, b0);\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a3 = _mm_aesenclast_si128(a3, b0);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a4 = _mm_aesenclast_si128(a4, b0);\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a5 = _mm_aesenclast_si128(a5, b0);\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a6 = _mm_aesenclast_si128(a6, b0);\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - a7 = _mm_aesenclast_si128(a7, b0);\ + a0 = _mm_shuffle_epi8( a0, m128_const_64( 0x03060a0d08020509, \ + 0x0c0f0104070b0e00 ) ); \ + a0 = _mm_aesenclast_si128( a0, b0 );\ + a1 = _mm_shuffle_epi8( a1, m128_const_64( 0x04070c0f0a03060b, \ + 0x0e090205000d0801 ) ); \ + a1 = _mm_aesenclast_si128( a1, b0 );\ + a2 = _mm_shuffle_epi8( a2, m128_const_64( 0x05000e090c04070d, \ + 0x080b0306010f0a02 ) ); \ + a2 = _mm_aesenclast_si128( a2, b0 );\ + a3 = _mm_shuffle_epi8( a3, m128_const_64( 0x0601080b0e05000f, \ + 0x0a0d040702090c03 ) ); \ + a3 = _mm_aesenclast_si128( a3, b0 );\ + a4 = _mm_shuffle_epi8( a4, m128_const_64( 0x0702090c0f060108, \ + 0x0b0e0500030a0d04 ) ); \ + a4 = _mm_aesenclast_si128( a4, b0 );\ + a5 = _mm_shuffle_epi8( a5, m128_const_64( 0x00030b0e0907020a, \ + 0x0d080601040c0f05 ) ); \ + a5 = _mm_aesenclast_si128( a5, b0 );\ + a6 = _mm_shuffle_epi8( a6, m128_const_64( 0x01040d080b00030c, \ + 0x0f0a0702050e0906 ) ); \ + a6 = _mm_aesenclast_si128( a6, b0 );\ + a7 = _mm_shuffle_epi8( a7, m128_const_64( 0x02050f0a0d01040e, \ + 0x090c000306080b07 ) ); \ + a7 = _mm_aesenclast_si128( a7, b0 );\ \ /* MixBytes */\ MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ @@ -235,7 +242,7 @@ __m128i ALL_FF; * clobbers: t0 */ #define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ + t0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 ); \ \ i0 = _mm_shuffle_epi8(i0, t0);\ i1 = _mm_shuffle_epi8(i1, t0);\ diff --git a/algo/groestl/aes_ni/groestl256-intr-avx.h b/algo/groestl/aes_ni/groestl256-intr-avx.h deleted file mode 100644 index 3eb8397..0000000 --- a/algo/groestl/aes_ni/groestl256-intr-avx.h +++ /dev/null @@ -1,482 +0,0 @@ -/* groestl-intr-avx.h Aug 2011 - * - * Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx - * instructions. - * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * This code is placed in the public domain
- */
-
-#include
-#include
-#include
-#include "hash-groestl256.h"
-
-/* global constants */
-__m128i ROUND_CONST_Lx;
-__m128i ROUND_CONST_L0[ROUNDS512];
-__m128i ROUND_CONST_L7[ROUNDS512];
-__m128i ROUND_CONST_P[ROUNDS1024];
-__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_FF;
-//#if LENGTH <= 256
-__m128i ALL_1B;
-//#else
-//__m256d ALL_1B;
-//#endif
-
-#define tos(a) #a
-#define tostr(a) tos(a)
-
-#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos)))
-#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos))
-
-#define SET_CONSTANTS(){\
- ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
- ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
- TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
- SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
- SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
- SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
- SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
- SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
- SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
- SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
- SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
- for(i = 0; i < ROUNDS512; i++)\
- {\
- ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
- ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
- }\
- ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
-}while(0);
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b
- * xmm[z] has to be zero */
-#define VMUL2(i, j, k, z){\
- j = _mm_cmpgt_epi8(z, i);\
- i = _mm_add_epi8(i, i);\
- j = _mm_and_si128(j, k);\
- i = _mm_xor_si128(i, j);\
-}/**/
-
-/* Yet another implementation of MixBytes.
- This time we use the formulae (3) from the paper "Byte Slicing Groestl".
- Input: a0, ..., a7
- Output: b0, ..., b7 = MixBytes(a0,...,a7).
- but we use the relations:
- t_i = a_i + a_{i+1}
- x_i = t_i + t_{i+3}
- y_i = t_i + t_{i+2} + a_{i+6}
- z_i = 2*x_i
- w_i = z_i + y_{i+4}
- v_i = 2*w_i
- b_i = v_{i+3} + y_{i+4}
- We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
- and then adding v_i computed in the meantime in registers xmm0..xmm7.
- We almost fit into 16 registers, need only 3 spills to memory.
- This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
- K. Matusiewicz, 2011/05/29 */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
- /* xmm8..xmm15 = a2 a3...
a0 a1 */\ - b0 = a2;\ - b1 = a3;\ - b2 = a4;\ - b3 = a5;\ - b4 = a6;\ - b5 = a7;\ - b6 = a0;\ - b7 = a1;\ - \ - /* t_i = a_i + a_{i+1} */\ - a0 = _mm_xor_si128(a0, a1);\ - a1 = _mm_xor_si128(a1, a2);\ - a2 = _mm_xor_si128(a2, a3);\ - a3 = _mm_xor_si128(a3, a4);\ - a4 = _mm_xor_si128(a4, a5);\ - a5 = _mm_xor_si128(a5, a6);\ - a6 = _mm_xor_si128(a6, a7);\ - a7 = _mm_xor_si128(a7, b6);\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - b0 = _mm_xor_si128(b0, a4);\ - b1 = _mm_xor_si128(b1, a5);\ - b2 = _mm_xor_si128(b2, a6);\ - b3 = _mm_xor_si128(b3, a7);\ - b4 = _mm_xor_si128(b4, a0);\ - b5 = _mm_xor_si128(b5, a1);\ - b6 = _mm_xor_si128(b6, a2);\ - b7 = _mm_xor_si128(b7, a3);\ - \ - b0 = _mm_xor_si128(b0, a6);\ - b1 = _mm_xor_si128(b1, a7);\ - b2 = _mm_xor_si128(b2, a0);\ - b3 = _mm_xor_si128(b3, a1);\ - b4 = _mm_xor_si128(b4, a2);\ - b5 = _mm_xor_si128(b5, a3);\ - b6 = _mm_xor_si128(b6, a4);\ - b7 = _mm_xor_si128(b7, a5);\ - \ - /* spill values y_4, y_5 to memory */\ - TEMP0 = b0;\ - TEMP1 = b1;\ - TEMP2 = b2;\ - \ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - b0 = a0;\ - b1 = a1;\ - TEMP3 = a2;\ - \ - /* compute x_i = t_i + t_{i+3} */\ - a0 = _mm_xor_si128(a0, a3);\ - a1 = _mm_xor_si128(a1, a4);\ - a2 = _mm_xor_si128(a2, a5);\ - a3 = _mm_xor_si128(a3, a6);\ - a4 = _mm_xor_si128(a4, a7);\ - a5 = _mm_xor_si128(a5, b0);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, TEMP3);\ - \ - /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - b1 = ALL_1B;\ - b2 = _mm_xor_si128(b2, b2);\ - VMUL2(a7, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a0, b0, b1, b2);\ - \ - /* compute w_i : add y_{i+4} */\ - a0 = _mm_xor_si128(a0, TEMP0);\ - a1 = _mm_xor_si128(a1, TEMP1);\ - a2 = _mm_xor_si128(a2, TEMP2);\ - a3 = _mm_xor_si128(a3, b3);\ - a4 = _mm_xor_si128(a4, b4);\ - a5 = _mm_xor_si128(a5, b5);\ - a6 = _mm_xor_si128(a6, b6);\ - a7 = _mm_xor_si128(a7, b7);\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a7, b0, b1, b2);\ - \ - /* add to y_4 y_5 .. v3, v4, ... 
*/\ - b0 = _mm_xor_si128(a3, TEMP0);\ - b1 = _mm_xor_si128(a4, TEMP1);\ - b2 = _mm_xor_si128(a5, TEMP2);\ - b3 = _mm_xor_si128(b3, a6);\ - b4 = _mm_xor_si128(b4, a7);\ - b5 = _mm_xor_si128(b5, a0);\ - b6 = _mm_xor_si128(b6, a1);\ - b7 = _mm_xor_si128(b7, a2);\ -}/*MixBytes*/ - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* Add Round Constant */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a4 = _mm_xor_si128(a4, b1);\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ - \ - /* ShiftBytes + SubBytes (interleaved) */\ - b0 = _mm_xor_si128(b0, b0);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a0 = _mm_aesenclast_si128(a0, b0);\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a1 = _mm_aesenclast_si128(a1, b0);\ - a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ - a2 = _mm_aesenclast_si128(a2, b0);\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a3 = _mm_aesenclast_si128(a3, b0);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a4 = _mm_aesenclast_si128(a4, b0);\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a5 = _mm_aesenclast_si128(a5, b0);\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a6 = _mm_aesenclast_si128(a6, b0);\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - a7 = _mm_aesenclast_si128(a7, b0);\ - \ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ - \ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ - \ - o1 = _mm_unpackhi_epi16(i0, i1);\ - i0 = _mm_unpacklo_epi16(i0, i1);\ - t0 = _mm_unpackhi_epi16(i2, i3);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - \ - i0 = _mm_shuffle_epi32(i0, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - t0 
= _mm_shuffle_epi32(t0, 216);\ - \ - o2 = _mm_unpackhi_epi32(i0, i2);\ - o3 = _mm_unpackhi_epi32(o1, t0);\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - o1 = _mm_unpacklo_epi32(o1, t0);\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - o1 = _mm_unpackhi_epi64(i0, i4);\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - o2 = _mm_unpacklo_epi64(i1, i5);\ - o3 = _mm_unpackhi_epi64(i1, i5);\ - o4 = _mm_unpacklo_epi64(i2, i6);\ - o5 = _mm_unpackhi_epi64(i2, i6);\ - o6 = _mm_unpacklo_epi64(i3, i7);\ - o7 = _mm_unpackhi_epi64(i3, i7);\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - o0 = _mm_unpackhi_epi64(i0, i1);\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o1 = _mm_unpackhi_epi64(i2, i3);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - o2 = _mm_unpackhi_epi64(i4, i5);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - o3 = _mm_unpackhi_epi64(i6, i7);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - t0 = _mm_xor_si128(t0, t0);\ - i1 = _mm_unpackhi_epi64(i0, t0);\ - i0 = _mm_unpacklo_epi64(i0, t0);\ - i3 = _mm_unpackhi_epi64(i2, t0);\ - i2 = _mm_unpacklo_epi64(i2, t0);\ - i5 = _mm_unpackhi_epi64(i4, t0);\ - i4 = _mm_unpacklo_epi64(i4, t0);\ - i7 = _mm_unpackhi_epi64(i6, t0);\ - i6 = _mm_unpacklo_epi64(i6, t0);\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - - -void INIT256(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; - static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; - - /* load IV into registers xmm12 - xmm15 */ - xmm12 = chaining[0]; - xmm13 = chaining[1]; - xmm14 = chaining[2]; - xmm15 = chaining[3]; - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* store transposed IV */ - chaining[0] = xmm12; - chaining[1] = xmm2; - chaining[2] = xmm6; - chaining[3] = xmm7; -} - -void TF512(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP0; - static __m128i TEMP1; - static __m128i TEMP2; - static __m128i TEMP3; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm12 - xmm15 */ - xmm12 = message[0]; - xmm13 = message[1]; - xmm14 = message[2]; - xmm15 = message[3]; - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - xmm8 = _mm_xor_si128(xmm12, chaining[0]); - xmm0 = _mm_xor_si128(xmm2, chaining[1]); - xmm4 = _mm_xor_si128(xmm6, chaining[2]); - xmm5 = _mm_xor_si128(xmm7, chaining[3]); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm10); - xmm2 = _mm_xor_si128(xmm2, xmm12); - xmm3 = _mm_xor_si128(xmm3, xmm14); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, chaining[0]); - xmm1 = _mm_xor_si128(xmm1, chaining[1]); - xmm2 = _mm_xor_si128(xmm2, chaining[2]); - xmm3 = _mm_xor_si128(xmm3, chaining[3]); - - /* store CV */ - chaining[0] = xmm0; - chaining[1] = xmm1; - chaining[2] = xmm2; - chaining[3] = xmm3; - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP0; - static __m128i TEMP1; - static __m128i TEMP2; - static __m128i TEMP3; - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - xmm8 = chaining[0]; - xmm10 = chaining[1]; - xmm12 = chaining[2]; - xmm14 = chaining[3]; - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm10 = _mm_xor_si128(xmm10, (chaining[1])); - xmm12 = _mm_xor_si128(xmm12, (chaining[2])); - xmm14 = _mm_xor_si128(xmm14, (chaining[3])); - - /* transform state back from row ordering into column ordering */ - /* result: final 
hash value in xmm9, xmm11 */ - Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); - - /* we only need to return the truncated half of the state */ - chaining[2] = xmm9; - chaining[3] = xmm11; -} - - diff --git a/algo/groestl/aes_ni/groestl256-intr-vperm.h b/algo/groestl/aes_ni/groestl256-intr-vperm.h deleted file mode 100644 index f6baa17..0000000 --- a/algo/groestl/aes_ni/groestl256-intr-vperm.h +++ /dev/null @@ -1,793 +0,0 @@ -/* groestl-intr-vperm.h Aug 2011 - * - * Groestl implementation with intrinsics using ssse3 instructions. - * Author: Günther A. Roland, Martin Schläffer - * - * Based on the vperm and aes_ni implementations of the hash function Groestl - * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ - * Institute of Applied Mathematics, Middle East Technical University, Turkey - * - * This code is placed in the public domain - */ - -#include -#include "hash-groestl256.h" - -/* global constants */ -__m128i ROUND_CONST_Lx; -__m128i ROUND_CONST_L0[ROUNDS512]; -__m128i ROUND_CONST_L7[ROUNDS512]; -__m128i ROUND_CONST_P[ROUNDS1024]; -__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_0F; -__m128i ALL_15; -__m128i ALL_1B; -__m128i ALL_63; -__m128i ALL_FF; -__m128i VPERM_IPT[2]; -__m128i VPERM_OPT[2]; -__m128i VPERM_INV[2]; -__m128i VPERM_SB1[2]; -__m128i VPERM_SB2[2]; -__m128i VPERM_SB4[2]; -__m128i VPERM_SBO[2]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_SHARED_CONSTANTS(){\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\ - ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\ - ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\ - VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\ - VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\ - VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\ - VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\ - VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\ - VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\ - VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\ - VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\ - VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\ - VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\ - VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\ - VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\ -}/**/ - -/* VPERM - * Transform w/o settings c* - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ - t0 = c0;\ - t1 = c0;\ - t0 = _mm_andnot_si128(t0, a0);\ - t1 = _mm_andnot_si128(t1, a1);\ - t0 = _mm_srli_epi32(t0, 4);\ - t1 = _mm_srli_epi32(t1, 4);\ - a0 = _mm_and_si128(a0, c0);\ - a1 = _mm_and_si128(a1, c0);\ - t2 = c2;\ - t3 = c2;\ - t2 = _mm_shuffle_epi8(t2, a0);\ - 
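An illustrative note, not part of the patch: VPERM_Transform_No_Const is the standard vperm nibble-split technique. Each byte is split into its low nibble (and with ALL_0F) and its high nibble (andnot plus a shift right by 4); each nibble then selects one of 16 table bytes held in an xmm register via pshufb, and the two lookups are XORed. A scalar model of a single byte, with hypothetical names (tab_hi corresponds to c1 = table[0], tab_lo to c2 = table[1]):

#include <stdint.h>

/* One vperm-style table application: a 256-entry byte mapping is
 * realized as two 16-entry lookups, one per nibble, XORed together.
 * The SIMD macro does this for 16 bytes at once with two pshufb. */
static uint8_t vperm_lookup( uint8_t x, const uint8_t tab_lo[16],
                             const uint8_t tab_hi[16] )
{
   return (uint8_t)( tab_lo[ x & 0x0f ] ^ tab_hi[ x >> 4 ] );
}

This covers byte maps that are XOR-separable over the nibbles, such as the linear IPT/OPT basis changes; the full AES S-box is not, which is why it needs the multi-step Substitute_Core/Lookup sequence below rather than a single pair of tables.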
t3 = _mm_shuffle_epi8(t3, a1);\ - a0 = c1;\ - a1 = c1;\ - a0 = _mm_shuffle_epi8(a0, t0);\ - a1 = _mm_shuffle_epi8(a1, t1);\ - a0 = _mm_xor_si128(a0, t2);\ - a1 = _mm_xor_si128(a1, t3);\ -}/**/ - -#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ - c0 = ALL_0F;\ - c1 = ((__m128i*) table )[0];\ - c2 = ((__m128i*) table )[1];\ -}/**/ - -/* VPERM - * Transform - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Transform State - * inputs: - * a0-a3 = state - * table = transformation table to use - * t* = clobbers - * outputs: - * a0-a3 = transformed state - * */ -#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ - VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Add Constant to State - * inputs: - * a0-a7 = state - * constant = constant to add - * t0 = clobber - * outputs: - * a0-a7 = state + constant - * */ -#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ - t0 = constant;\ - a0 = _mm_xor_si128(a0, t0);\ - a1 = _mm_xor_si128(a1, t0);\ - a2 = _mm_xor_si128(a2, t0);\ - a3 = _mm_xor_si128(a3, t0);\ - a4 = _mm_xor_si128(a4, t0);\ - a5 = _mm_xor_si128(a5, t0);\ - a6 = _mm_xor_si128(a6, t0);\ - a7 = _mm_xor_si128(a7, t0);\ -}/**/ - -/* VPERM - * Set Substitute Core Constants - * */ -#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ - VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ -}/**/ - -/* VPERM - * Substitute Core - * first part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0 = 1 row - * t*, c* = clobbers - * outputs: - * b0a, b0b = inputs for lookup step - * */ -#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ - t0 = c0;\ - t0 = _mm_andnot_si128(t0, a0);\ - t0 = _mm_srli_epi32(t0, 4);\ - a0 = _mm_and_si128(a0, c0);\ - b0a = c1;\ - b0a = _mm_shuffle_epi8(b0a, a0);\ - a0 = _mm_xor_si128(a0, t0);\ - b0b = c2;\ - b0b = _mm_shuffle_epi8(b0b, t0);\ - b0b = _mm_xor_si128(b0b, b0a);\ - t1 = c2;\ - t1 = _mm_shuffle_epi8(t1, a0);\ - t1 = _mm_xor_si128(t1, b0a);\ - b0a = c2;\ - b0a = _mm_shuffle_epi8(b0a, b0b);\ - b0a = _mm_xor_si128(b0a, a0);\ - b0b = c2;\ - b0b = _mm_shuffle_epi8(b0b, t1);\ - b0b = _mm_xor_si128(b0b, t0);\ -}/**/ - -/* VPERM - * Lookup - * second part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0a, a0b = output of Substitution Core - * table = lookup table to use (*1 / *2 / *4) - * t0 = clobber - * outputs: - * b0 = output of sbox + multiplication - * */ -#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ - b0 = ((__m128i*) table )[0];\ - t0 = ((__m128i*) table )[1];\ - b0 = _mm_shuffle_epi8(b0, a0b);\ - t0 = _mm_shuffle_epi8(t0, a0a);\ - b0 = _mm_xor_si128(b0, t0);\ -}/**/ - -/* VPERM - * SubBytes and *2 / *4 - * this function is derived from: - * Constant-time 
SSSE3 AES core implementation - * by Mike Hamburg - * and - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0-a7 = state - * t*, c* = clobbers - * outputs: - * a0-a7 = state * 4 - * c2 = row0 * 2 -> b0 - * c1 = row7 * 2 -> b3 - * c0 = row7 * 1 -> b4 - * t2 = row4 * 1 -> b7 - * TEMP_MUL1 = row(i) * 1 - * TEMP_MUL2 = row(i) * 2 - * - * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ -#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ - /* set Constants */\ - VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ - /* row 1 */\ - VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[1] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[1] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ - /* --- */\ - /* row 2 */\ - VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[2] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[2] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ - /* --- */\ - /* row 3 */\ - VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[3] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[3] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ - /* --- */\ - /* row 5 */\ - VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[5] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[5] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ - /* --- */\ - /* row 6 */\ - VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[6] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[6] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ - /* --- */\ - /* row 7 */\ - VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[7] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ - VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ - /* --- */\ - /* row 4 */\ - VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[4] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ - /* --- */\ - /* row 0 */\ - VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ - TEMP_MUL2[0] = c2;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ - /* --- */\ -}/**/ - - -/* Optimized MixBytes - * inputs: - * a0-a7 = (row0-row7) * 4 - * b0 = row0 * 2 - * b3 = row7 * 2 - * b4 = row7 * 1 - * b7 = row4 * 1 - * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 - * output: b0-b7 - * */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* save one value */\ - TEMP_MUL4 = a3;\ - /* 1 */\ - b1 = a0;\ - b1 = _mm_xor_si128(b1, a5);\ - b1 = _mm_xor_si128(b1, b4); /* -> helper! */\ - b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\ - b2 = b1;\ - \ - /* 2 */\ - b5 = a1;\ - b5 = _mm_xor_si128(b5, a4);\ - b5 = _mm_xor_si128(b5, b7); /* -> helper! */\ - b5 = _mm_xor_si128(b5, b3); /* -> helper! */\ - b6 = b5;\ - \ - /* 4 */\ - b7 = _mm_xor_si128(b7, a6);\ - /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! 
*/\ - b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\ - b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\ - b7 = _mm_xor_si128(b7, b3); /* -> helper! */\ - b2 = _mm_xor_si128(b2, b7);\ - \ - /* 3 */\ - b0 = _mm_xor_si128(b0, a7);\ - b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\ - b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\ - /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\ - b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\ - b3 = b0;\ - b1 = _mm_xor_si128(b1, b0);\ - b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\ - \ - /* 5 */\ - b4 = _mm_xor_si128(b4, a2);\ - /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\ - b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\ - b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\ - b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\ - b3 = _mm_xor_si128(b3, b4);\ - b6 = _mm_xor_si128(b6, b4);\ - \ - /* 6 */\ - a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\ - b4 = _mm_xor_si128(b4, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - b7 = _mm_xor_si128(b7, a3);\ - \ - /* 7 */\ - a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\ - a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\ - b2 = _mm_xor_si128(b2, a1);\ - b3 = _mm_xor_si128(b3, a1);\ - \ - /* 8 */\ - a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\ - a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\ - b6 = _mm_xor_si128(b6, a5);\ - b7 = _mm_xor_si128(b7, a5);\ - \ - /* 9 */\ - a3 = TEMP_MUL1[2];\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\ - b0 = _mm_xor_si128(b0, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - \ - /* 10 */\ - a1 = TEMP_MUL1[6];\ - a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\ - b1 = _mm_xor_si128(b1, a1);\ - b4 = _mm_xor_si128(b4, a1);\ - \ - /* 11 */\ - a5 = TEMP_MUL1[3];\ - a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\ - b1 = _mm_xor_si128(b1, a5);\ - b6 = _mm_xor_si128(b6, a5);\ - \ - /* 12 */\ - a3 = TEMP_MUL1[7];\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\ - b2 = _mm_xor_si128(b2, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - \ - /* 13 */\ - b0 = _mm_xor_si128(b0, (TEMP_MUL4));\ - b0 = _mm_xor_si128(b0, a4);\ - b1 = _mm_xor_si128(b1, a4);\ - b3 = _mm_xor_si128(b3, a6);\ - b4 = _mm_xor_si128(b4, a0);\ - b4 = _mm_xor_si128(b4, a7);\ - b5 = _mm_xor_si128(b5, a0);\ - b7 = _mm_xor_si128(b7, a2);\ -}/**/ - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\ - SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\ - SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\ - SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\ - SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\ - SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ - }\ - ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ -}/**/ - -/* vperm: - * transformation before rounds with ipt - * first round add transformed constant - * middle rounds: add constant XOR 0x15...15 - * last round: additionally add 0x15...15 after MB - * 
transformation after rounds with opt - */ -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant + ShiftBytes (interleaved) */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a4 = _mm_xor_si128(a4, b1);\ - a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - /* SubBytes + Multiplication by 2 and 4 */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ - ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ -} - - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ -\ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ -\ - o1 = i0;\ - t0 = i2;\ -\ - i0 = _mm_unpacklo_epi16(i0, i1);\ - o1 = _mm_unpackhi_epi16(o1, i1);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - t0 = _mm_unpackhi_epi16(t0, i3);\ -\ - i0 = _mm_shuffle_epi32(i0, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - t0 = _mm_shuffle_epi32(t0, 216);\ -\ - o2 = i0;\ - o3 = o1;\ -\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - o1 = _mm_unpacklo_epi32(o1, t0);\ - o2 = _mm_unpackhi_epi32(o2, i2);\ - o3 = _mm_unpackhi_epi32(o3, t0);\ -}/**/ - -/* Matrix Transpose Step 2 - * input 
are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - o1 = i0;\ - o2 = i1;\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - o1 = _mm_unpackhi_epi64(o1, i4);\ - o3 = i1;\ - o4 = i2;\ - o2 = _mm_unpacklo_epi64(o2, i5);\ - o3 = _mm_unpackhi_epi64(o3, i5);\ - o5 = i2;\ - o6 = i3;\ - o4 = _mm_unpacklo_epi64(o4, i6);\ - o5 = _mm_unpackhi_epi64(o5, i6);\ - o7 = i3;\ - o6 = _mm_unpacklo_epi64(o6, i7);\ - o7 = _mm_unpackhi_epi64(o7, i7);\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - o0 = i0;\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o0 = _mm_unpackhi_epi64(o0, i1);\ - o1 = i2;\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - o1 = _mm_unpackhi_epi64(o1, i3);\ - o2 = i4;\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - o2 = _mm_unpackhi_epi64(o2, i5);\ - o3 = i6;\ - i6 = _mm_unpacklo_epi64(i6, i7);\ - o3 = _mm_unpackhi_epi64(o3, i7);\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - t0 = _mm_xor_si128(t0, t0);\ - i1 = i0;\ - i3 = i2;\ - i5 = i4;\ - i7 = i6;\ - i0 = _mm_unpacklo_epi64(i0, t0);\ - i1 = _mm_unpackhi_epi64(i1, t0);\ - i2 = _mm_unpacklo_epi64(i2, t0);\ - i3 = _mm_unpackhi_epi64(i3, t0);\ - i4 = _mm_unpacklo_epi64(i4, t0);\ - i5 = _mm_unpackhi_epi64(i5, t0);\ - i6 = _mm_unpacklo_epi64(i6, t0);\ - i7 = _mm_unpackhi_epi64(i7, t0);\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - xmm0 = ROUND_CONST_L0[i];\ - xmm1 = ROUND_CONST_L7[i];\ - xmm2 = ROUND_CONST_L0[j];\ - xmm3 = ROUND_CONST_L7[j];\ - VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ - xmm1 = _mm_xor_si128(xmm1, (ALL_15));\ - xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ - xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ - ROUND_CONST_L0[i] = xmm0;\ - ROUND_CONST_L7[i] = xmm1;\ - ROUND_CONST_L0[j] = xmm2;\ - ROUND_CONST_L7[j] = xmm3;\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - xmm0 = ROUND_CONST_Lx;\ - VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ - ROUND_CONST_Lx = xmm0;\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - 
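A side note, not part of the patch: the values being transformed here are the ordinary Groestl-256 AddRoundConstant words, the same ones the retained AES-NI version bakes into the static round_const_l0/round_const_l7 arrays earlier in this patch. For round i, row 0 gets 0x7060504030201000 ^ (i * 0x0101010101010101) and row 7 gets 0x8f9fafbfcfdfefff ^ (i * 0x0101010101010101). A throwaway generator along these lines (hypothetical, useful only for checking the tables):

#include <stdint.h>
#include <stdio.h>

/* Print the (row 0, row 7) round-constant qwords for the 10 rounds of
 * Groestl-256; output should match round_const_l0 / round_const_l7. */
int main( void )
{
   const uint64_t base_l0 = 0x7060504030201000ULL;
   const uint64_t base_l7 = 0x8f9fafbfcfdfefffULL;
   for ( int i = 0; i < 10; i++ )
   {
      uint64_t d = (uint64_t)i * 0x0101010101010101ULL;
      /* l0[i]: low qword is the constant, high qword is all ones */
      printf( "l0[%d] = %016llx ffffffffffffffff\n", i,
              (unsigned long long)( base_l0 ^ d ) );
      /* l7[i]: low qword is zero, high qword is the constant */
      printf( "l7[%d] = 0000000000000000 %016llx\n", i,
              (unsigned long long)( base_l7 ^ d ) );
   }
   return 0;
}

The vperm build cannot use these constants as-is; this macro pushes them through the VPERM_IPT basis change once at init and XORs in ALL_15, matching the "middle rounds: add constant XOR 0x15...15" scheme described in the header comment above.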
VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ -}/**/ - -void INIT256(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15; - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm12 - xmm15 */ - xmm12 = chaining[0]; - xmm13 = chaining[1]; - xmm14 = chaining[2]; - xmm15 = chaining[3]; - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* store transposed IV */ - chaining[0] = xmm12; - chaining[1] = xmm2; - chaining[2] = xmm6; - chaining[3] = xmm7; -} - -void TF512(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm12 - xmm15 */ - xmm12 = message[0]; - xmm13 = message[1]; - xmm14 = message[2]; - xmm15 = message[3]; - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - xmm8 = chaining[0]; - xmm0 = chaining[1]; - xmm4 = chaining[2]; - xmm5 = chaining[3]; - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - xmm8 = _mm_xor_si128(xmm8, xmm12); - xmm0 = _mm_xor_si128(xmm0, xmm2); - xmm4 = _mm_xor_si128(xmm4, xmm6); - xmm5 = _mm_xor_si128(xmm5, xmm7); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm10); - xmm2 = _mm_xor_si128(xmm2, xmm12); - xmm3 = _mm_xor_si128(xmm3, xmm14); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, (chaining[0])); - xmm1 = _mm_xor_si128(xmm1, (chaining[1])); - xmm2 = _mm_xor_si128(xmm2, (chaining[2])); - xmm3 = _mm_xor_si128(xmm3, (chaining[3])); - - /* store CV */ - chaining[0] = xmm0; - chaining[1] = xmm1; - chaining[2] = xmm2; - chaining[3] = xmm3; - -#ifdef 
IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF512(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - xmm8 = chaining[0]; - xmm10 = chaining[1]; - xmm12 = chaining[2]; - xmm14 = chaining[3]; - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm10 = _mm_xor_si128(xmm10, (chaining[1])); - xmm12 = _mm_xor_si128(xmm12, (chaining[2])); - xmm14 = _mm_xor_si128(xmm14, (chaining[3])); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); - VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); - - /* we only need to return the truncated half of the state */ - chaining[2] = xmm9; - chaining[3] = xmm11; - - return; -}//OF512() - - - diff --git a/algo/groestl/aes_ni/hash-groestl.c b/algo/groestl/aes_ni/hash-groestl.c index cf680e4..d26ef27 100644 --- a/algo/groestl/aes_ni/hash-groestl.c +++ b/algo/groestl/aes_ni/hash-groestl.c @@ -16,48 +16,13 @@ #ifdef __AES__ -#include "groestl-version.h" - -#ifdef TASM - #ifdef VAES - #include "groestl-asm-aes.h" - #else - #ifdef VAVX - #include "groestl-asm-avx.h" - #else - #ifdef VVPERM - #include "groestl-asm-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif -#else - #ifdef TINTR - #ifdef VAES - #include "groestl-intr-aes.h" - #else - #ifdef VAVX - #include "groestl-intr-avx.h" - #else - #ifdef VVPERM - #include "groestl-intr-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif - #else - #error NO TYPE SPECIFIED (-DT[ASM/INTR]) - #endif -#endif +#include "groestl-intr-aes.h" HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen ) { int i; ctx->hashlen = hashlen; - SET_CONSTANTS(); if (ctx->chaining == NULL || ctx->buffer == NULL) return FAIL_GR; @@ -70,8 +35,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen ) // The only non-zero in the IV is len. It can be hard coded. ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); -// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); -// INIT(ctx->chaining); ctx->buf_ptr = 0; ctx->rem_ptr = 0; @@ -92,8 +55,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx ) ctx->buffer[i] = _mm_setzero_si128(); } ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); -// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); -// INIT(ctx->chaining); ctx->buf_ptr = 0; ctx->rem_ptr = 0; @@ -109,7 +70,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx ) // 5. 
Midstate will work at reduced impact than full hash, if total hash // (midstate + tail) is less than 1 block. // This, unfortunately, is the case with all current users. -// 6. the morefull blocks the bigger the gain +// 6. the more full blocks the bigger the gain // use only for midstate precalc HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input, @@ -143,12 +104,11 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input, // deprecated do not use HashReturn_gr final_groestl( hashState_groestl* ctx, void* output ) { - const int len = (int)ctx->databitlen / 128; // bits to __m128i - const int blocks = ctx->blk_count + 1; // adjust for final block - - const int rem_ptr = ctx->rem_ptr; // end of data start of padding - const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i - const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer + const int len = (int)ctx->databitlen / 128; // bits to __m128i + const uint64_t blocks = ctx->blk_count + 1; // adjust for final block + const int rem_ptr = ctx->rem_ptr; // end of data start of padding + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer int i; // first pad byte = 0x80, last pad byte = block count @@ -157,21 +117,18 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output ) if ( rem_ptr == len - 1 ) { // only 128 bits left in buffer, all padding at once - ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 ); // add zero padding for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = _mm_setzero_si128(); // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, - 0, 0 ,0,0, 0,0,0,0 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 ); } // digest final padding block and do output transform @@ -189,21 +146,20 @@ int groestl512_full( hashState_groestl* ctx, void* output, const void* input, uint64_t databitlen ) { - int i; - - ctx->hashlen = 64; - SET_CONSTANTS(); - - for ( i = 0; i < SIZE512; i++ ) - { - ctx->chaining[i] = _mm_setzero_si128(); - ctx->buffer[i] = _mm_setzero_si128(); - } - ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); - ctx->buf_ptr = 0; - ctx->rem_ptr = 0; + int i; + ctx->hashlen = 64; + for ( i = 0; i < SIZE512; i++ ) + { + ctx->chaining[i] = _mm_setzero_si128(); + ctx->buffer[i] = _mm_setzero_si128(); + } + ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + // --- update --- + const int len = (int)databitlen / 128; const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; @@ -211,8 +167,6 @@ int groestl512_full( hashState_groestl* ctx, void* output, uint64_t blocks = len / SIZE512; __m128i* in = (__m128i*)input; - // --- update --- - // digest any full blocks, process directly from input for ( i = 0; i < blocks; i++ ) TF1024( ctx->chaining, &in[ i * SIZE512 ] ); @@ -231,26 +185,22 @@ int groestl512_full( hashState_groestl* ctx, void* output, if ( i == len -1 ) { // only 128 bits left in buffer, all padding at once - ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[i] = 
_mm_set_epi64x( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[i] = m128_const_64( 0, 0x80 ); // add zero padding for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = _mm_setzero_si128(); // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, - 0, 0 ,0,0, 0,0,0,0 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 ); } // digest final padding block and do output transform TF1024( ctx->chaining, ctx->buffer ); - OF1024( ctx->chaining ); // store hash result in output @@ -268,7 +218,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output, const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE512; + uint64_t blocks = len / SIZE512; __m128i* in = (__m128i*)input; int i; @@ -292,26 +242,22 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output, if ( i == len -1 ) { // only 128 bits left in buffer, all padding at once - ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[i] = m128_const_64( 0, 0x80 ); // add zero padding for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = _mm_setzero_si128(); // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, - 0, 0 ,0,0, 0,0,0,0 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 ); } // digest final padding block and do output transform TF1024( ctx->chaining, ctx->buffer ); - OF1024( ctx->chaining ); // store hash result in output diff --git a/algo/groestl/aes_ni/hash-groestl256.c b/algo/groestl/aes_ni/hash-groestl256.c index ac6e5f5..34a37b1 100644 --- a/algo/groestl/aes_ni/hash-groestl256.c +++ b/algo/groestl/aes_ni/hash-groestl256.c @@ -13,41 +13,7 @@ #ifdef __AES__ -#include "groestl-version.h" - -#ifdef TASM - #ifdef VAES - #include "groestl256-asm-aes.h" - #else - #ifdef VAVX - #include "groestl256-asm-avx.h" - #else - #ifdef VVPERM - #include "groestl256-asm-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif -#else - #ifdef TINTR - #ifdef VAES - #include "groestl256-intr-aes.h" - #else - #ifdef VAVX - #include "groestl256-intr-avx.h" - #else - #ifdef VVPERM - #include "groestl256-intr-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif - #else - #error NO TYPE SPECIFIED (-DT[ASM/INTR]) - #endif -#endif +#include "groestl256-intr-aes.h" /* initialise context */ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen ) @@ -55,7 +21,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen ) int i; ctx->hashlen = hashlen; - SET_CONSTANTS(); if (ctx->chaining == NULL || ctx->buffer == NULL) return FAIL_GR; diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index 0193210..3f4b671 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -53,7 +53,7 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, n += 8; } while ( (n < max_nonce-8) && !work_restart[thr_id].restart); - + pdata[19] = n; *hashes_done = n - first_nonce + 1; return 0; } @@ -104,7 +104,7 @@ int 
scanhash_keccak_4way( struct work *work, uint32_t max_nonce, m256_const1_64( 0x0000000400000000 ) ); n += 4; } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - + pdata[19] = n; *hashes_done = n - first_nonce + 1; return 0; } diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index 568a5da..282ae91 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -74,7 +74,7 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) bool register_sha3d_algo( algo_gate_t* gate ) { hard_coded_eb = 6; - opt_extranonce = false; +// opt_extranonce = false; gate->optimizations = AVX2_OPT | AVX512_OPT; gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root; #if defined (KECCAK_8WAY) diff --git a/algo/keccak/sha3d-4way.c b/algo/keccak/sha3d-4way.c index dfd4320..41259e8 100644 --- a/algo/keccak/sha3d-4way.c +++ b/algo/keccak/sha3d-4way.c @@ -46,7 +46,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, sha3d_hash_8way( hash, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) ) { extr_lane_8x64( lane_hash, hash, lane, 256 ); if ( valid_hash( lane_hash, ptarget ) ) @@ -59,8 +59,8 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, m512_const1_64( 0x0000000800000000 ) ); n += 8; - } while ( (n < last_nonce) && !work_restart[thr_id].restart); - + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -105,7 +105,7 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, sha3d_hash_4way( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); if ( valid_hash( lane_hash, ptarget ) ) @@ -117,8 +117,8 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, *noncev = _mm256_add_epi32( *noncev, m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( (n < last_nonce) && !work_restart[thr_id].restart); - + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index 09fbe13..780e56d 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -344,17 +344,12 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, // 16 byte partial block exists for 80 byte len if ( state->rembytes ) - { - // padding of partial block - rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), + // padding of partial block + rnd512( state, m128_const_64( 0, 0x80000000 ), mm128_bswap_32( cast_m128i( data ) ) ); - } else - { - // empty pad block - rnd512( state, _mm_setzero_si128(), - _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); - } + // empty pad block + rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) ); finalization512( state, (uint32*) output ); if ( state->hashbitlen > 512 ) @@ -363,6 +358,56 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, return SUCCESS; } + +int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, + const BitSequence* data, size_t inlen ) +{ +// Optimized for multiples of 16 bytes, good for 64 and 80 byte len + int i; + state->hashbitlen = hashbitlen; + /* set the lower 32 bits to '1' 
*/ + MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); + /* set all bits to '1' */ + ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); + /* set the 32-bit round constant values to the 128-bit data field */ + for ( i=0; i<32; i++ ) + CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] ); + for ( i=0; i<10; i++ ) + state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] ); + memset(state->buffer, 0, sizeof state->buffer ); + + // update + + int blocks = (int)( inlen / 32 ); + state->rembytes = inlen % 32; + + // full blocks + for ( i = 0; i < blocks; i++ ) + { + rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ), + mm128_bswap_32( casti_m128i( data, 0 ) ) ); + data += MSG_BLOCK_BYTE_LEN; + } + + // final + + // 16 byte partial block exists for 80 byte len + if ( state->rembytes ) + // padding of partial block + rnd512( state, m128_const_64( 0, 0x80000000 ), + mm128_bswap_32( cast_m128i( data ) ) ); + else + // empty pad block + rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) ); + + finalization512( state, (uint32*) output ); + if ( state->hashbitlen > 512 ) + finalization512( state, (uint32*)( output+128 ) ); + + return SUCCESS; +} + + /***************************************************/ /* Round function */ /* state: hash context */ diff --git a/algo/luffa/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h index d21b34c..5d0cb75 100644 --- a/algo/luffa/luffa_for_sse2.h +++ b/algo/luffa/luffa_for_sse2.h @@ -65,5 +65,6 @@ HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval ); HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, const BitSequence* data, size_t inlen ); - +int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, + const BitSequence* data, size_t inlen ); diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index e29419a..203ac89 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -280,14 +280,15 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, allium_16way_hash( hash, vdata ); for ( int lane = 0; lane < 16; lane++ ) - if unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) + if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) ) { pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, hash+(lane<<3), mythr, lane ); } *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) ); n += 16; - } while ( (n < last_nonce) && !work_restart[thr_id].restart); + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -318,7 +319,6 @@ void allium_8way_hash( void *hash, const void *input ) { uint64_t vhashA[4*8] __attribute__ ((aligned (64))); uint64_t vhashB[4*8] __attribute__ ((aligned (64))); -// uint64_t hash[4*8] __attribute__ ((aligned (64))); uint64_t *hash0 = (uint64_t*)hash; uint64_t *hash1 = (uint64_t*)hash+ 4; uint64_t *hash2 = (uint64_t*)hash+ 8; @@ -443,7 +443,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, for ( int lane = 0; lane < 8; lane++ ) { const uint64_t *lane_hash = hash + (lane<<2); - if unlikely( valid_hash( lane_hash, ptarget ) && !bench ) + if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) ) { pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, lane_hash, mythr, lane ); @@ -451,7 +451,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, } n += 8; *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) ); - } while likely( (n <= last_nonce) 
&& !work_restart[thr_id].restart ); + } while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) ); pdata[19] = n; *hashes_done = n - first_nonce; return 0; diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index b6b90fe..16eb23c 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -194,7 +194,7 @@ bool register_allium_algo( algo_gate_t* gate ) ///////////////////////////////////////// -bool phi2_has_roots; +bool phi2_has_roots = false; bool phi2_use_roots = false; int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; } diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index 28811a6..89ae6da 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -189,7 +189,7 @@ bool init_allium_ctx(); // #define PHI2_4WAY #endif -bool phi2_has_roots; +extern bool phi2_has_roots; bool register_phi2_algo( algo_gate_t* gate ); #if defined(PHI2_4WAY) diff --git a/algo/simd/nist.c b/algo/simd/nist.c index 17b86a6..fbd4e71 100644 --- a/algo/simd/nist.c +++ b/algo/simd/nist.c @@ -360,18 +360,116 @@ HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval, return SUCCESS; } +int simd_full( hashState_sd *state, BitSequence *hashval, + const BitSequence *data, DataLength databitlen ) +{ + -/*HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, - BitSequence *hashval) { - hashState_sd s; - HashReturn r; - r = Init(&s, hashbitlen); - if (r != SUCCESS) - return r; - r = Update(&s, data, databitlen); - if (r != SUCCESS) - return r; - r = Final(&s, hashval); - return r; + InitIV( state, 512, IV_512 ); + + int current, i; + unsigned int bs = state->blocksize; + static int align = -1; + BitSequence out[64]; + int isshort = 1; + u64 l; + + if (align == -1) + align = RequiredAlignment(); + +#ifdef HAS_64 + current = state->count & (bs - 1); +#else + current = state->count_low & (bs - 1); +#endif + + if ( current & 7 ) + { + // The number of hashed bits is not a multiple of 8. + // Very painful to implement and not required by the NIST API. + return FAIL; + } + + while ( databitlen > 0 ) + { + if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. + SIMD_Compress(state, data, 0); + databitlen -= bs; + data += bs/8; + IncreaseCounter(state, bs); + } + else + { + // Copy a chunk of data to the buffer + unsigned int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer+current/8, data, (databitlen+7)/8 ); + IncreaseCounter( state, databitlen ); + break; + } + else + { + memcpy( state->buffer+current/8, data, len/8 ); + IncreaseCounter( state,len ); + databitlen -= len; + data += len/8; + current = 0; + SIMD_Compress( state, state->buffer, 0 ); + } + } + } + + current = state->count & (state->blocksize - 1); + + // If there is still some data in the buffer, hash it + if ( current ) + { + // We first need to zero out the end of the buffer. 
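+ // Illustrative example (hypothetical values): with current == 12
+ // buffered bits, the mask below is 0xff >> (12 & 7) == 0x0f, so
+ // buffer[1] &= ~0x0f clears the unused low nibble while keeping the
+ // 4 valid high-order message bits of that byte.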
+ if ( current & 7 ) + { + BitSequence mask = 0xff >> ( current & 7 ); + state->buffer[current/8] &= ~mask; + } + current = ( current+7 ) / 8; + memset( state->buffer+current, 0, state->blocksize/8 - current ); + SIMD_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, state->blocksize / 8 ); + l = state->count; + for ( i=0; i<8; i++ ) + { + state->buffer[i] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_Compress( state, state->buffer, isshort ); + + // Decode the 32-bit words into a BitSequence + for ( i=0; i < 2*state->n_feistels; i++ ) + { + u32 x = state->A[i]; + out[4*i ] = x & 0xff; + x >>= 8; + out[4*i+1] = x & 0xff; + x >>= 8; + out[4*i+2] = x & 0xff; + x >>= 8; + out[4*i+3] = x & 0xff; + } + + memcpy( hashval, out, state->hashbitlen / 8 ); + if ( state->hashbitlen % 8 ) + { + BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) ); + hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask; + } + return SUCCESS; } -*/ + diff --git a/algo/simd/nist.h b/algo/simd/nist.h index f4b017d..b4737ff 100644 --- a/algo/simd/nist.h +++ b/algo/simd/nist.h @@ -47,8 +47,8 @@ HashReturn final_sd(hashState_sd *state, BitSequence *hashval); HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval, const BitSequence *data, DataLength databitlen ); -//HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, -// BitSequence *hashval); +int simd_full( hashState_sd *state, BitSequence *hashval, + const BitSequence *data, DataLength databitlen ); /* * Internal API diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index 528f66f..142fb74 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -727,7 +727,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data, { memcpy_256( buf + (ptr>>3), vdata, len>>3 ); sc->ptr = ptr + len; - return; + if ( ptr < buf_size ) return; } READ_STATE_BIG( sc ); @@ -745,6 +745,8 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data, clen = buf_size - ptr; if ( clen > len ) clen = len; + len -= clen; + if ( len == 0 ) break; memcpy_256( buf + (ptr>>3), vdata, clen>>3 ); ptr += clen; vdata += (clen>>3); @@ -769,9 +771,12 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n, READ_STATE_BIG(sc); - memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); - et = 352 + ((bcount == 0) << 7); - UBI_BIG_4WAY( et, ptr ); + if ( ptr ) + { + memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + et = 352 + ((bcount == 0) << 7); + UBI_BIG_4WAY( et, ptr ); + } memset_zero_256( buf, buf_size >> 3 ); bcount = 0; diff --git a/algo/x11/tribus-4way.c b/algo/x11/tribus-4way.c index 7e56268..0092763 100644 --- a/algo/x11/tribus-4way.c +++ b/algo/x11/tribus-4way.c @@ -17,8 +17,6 @@ static __thread jh512_8way_context ctx_mid; void tribus_hash_8way( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhashA[4*8] __attribute__ ((aligned (64))); - uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -44,6 +42,8 @@ void tribus_hash_8way( void *state, const void *input ) keccak512_8way_close( &ctx_keccak, vhash ); #if defined(__VAES__) + uint64_t vhashA[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*4] __attribute__ ((aligned (64))); 
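+ // Re-interleave the 8x64-bit keccak output into two 4x128-bit lane
+ // groups (vhashA, vhashB) so the 4-way VAES path below can work on
+ // 128-bit lanes directly.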
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); diff --git a/algo/x16/hex.c b/algo/x16/hex.c index 631e428..bd2df93 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -76,10 +76,13 @@ union _hex_context_overlay }; typedef union _hex_context_overlay hex_context_overlay; +static __thread hex_context_overlay hex_ctx; + void hex_hash( void* output, const void* input ) { uint32_t _ALIGN(128) hash[16]; hex_context_overlay ctx; + memcpy( &ctx, &hex_ctx, sizeof(ctx) ); void *in = (void*) input; int size = 80; /* @@ -109,23 +112,21 @@ void hex_hash( void* output, const void* input ) break; case GROESTL: #if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)in, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 ); #else sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, in, size ); sph_groestl512_close(&ctx.groestl, hash); #endif break; - case SKEIN: - sph_skein512_init( &ctx.skein ); - sph_skein512( &ctx.skein, in, size ); - sph_skein512_close( &ctx.skein, hash ); - break; case JH: - sph_jh512_init( &ctx.jh ); - sph_jh512(&ctx.jh, in, size ); + if ( i == 0 ) + sph_jh512(&ctx.jh, in+64, 16 ); + else + { + sph_jh512_init( &ctx.jh ); + sph_jh512(&ctx.jh, in, size ); + } sph_jh512_close(&ctx.jh, hash ); break; case KECCAK: @@ -133,15 +134,37 @@ void hex_hash( void* output, const void* input ) sph_keccak512( &ctx.keccak, in, size ); sph_keccak512_close( &ctx.keccak, hash ); break; - case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)in, size ); + case SKEIN: + if ( i == 0 ) + sph_skein512(&ctx.skein, in+64, 16 ); + else + { + sph_skein512_init( &ctx.skein ); + sph_skein512( &ctx.skein, in, size ); + } + sph_skein512_close( &ctx.skein, hash ); break; + case LUFFA: + if ( i == 0 ) + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)in+64, 16 ); + else + { + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)in, size ); + } + break; case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)in, size ); + if ( i == 0 ) + cubehashUpdateDigest( &ctx.cube, (byte*)hash, + (const byte*)in+64, 16 ); + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash, + (const byte*)in, size ); + } break; case SHAVITE: sph_shavite512_init( &ctx.shavite ); @@ -155,9 +178,8 @@ void hex_hash( void* output, const void* input ) break; case ECHO: #if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash, 512, + (const BitSequence *)in, size ); #else sph_echo512_init( &ctx.echo ); sph_echo512( &ctx.echo, in, size ); @@ -165,9 +187,14 @@ void hex_hash( void* output, const void* input ) #endif break; case HAMSI: - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512( &ctx.hamsi, in, size ); - sph_hamsi512_close( &ctx.hamsi, hash ); + if ( i == 0 ) + sph_hamsi512( &ctx.hamsi, in+64, 16 ); + else + { + sph_hamsi512_init( &ctx.hamsi ); + sph_hamsi512( &ctx.hamsi, in, size ); + } + sph_hamsi512_close( &ctx.hamsi, hash ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -175,14 +202,24 @@ void hex_hash( void* output, const void* input ) sph_fugue512_close( &ctx.fugue, hash ); break; case SHABAL: - sph_shabal512_init( &ctx.shabal ); - 
sph_shabal512( &ctx.shabal, in, size ); - sph_shabal512_close( &ctx.shabal, hash ); + if ( i == 0 ) + sph_shabal512( &ctx.shabal, in+64, 16 ); + else + { + sph_shabal512_init( &ctx.shabal ); + sph_shabal512( &ctx.shabal, in, size ); + } + sph_shabal512_close( &ctx.shabal, hash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); - sph_whirlpool_close( &ctx.whirlpool, hash ); + if ( i == 0 ) + sph_whirlpool( &ctx.whirlpool, in+64, 16 ); + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in, size ); + } + sph_whirlpool_close( &ctx.whirlpool, hash ); break; case SHA_512: SHA512_Init( &ctx.sha512 ); @@ -201,47 +238,77 @@ int scanhash_hex( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; + uint32_t _ALIGN(128) edata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated + const uint32_t last_nonce = max_nonce - 4; + const int thr_id = mythr->id; uint32_t nonce = first_nonce; volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; + if ( bench ) ptarget[7] = 0x0cff; - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); + casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); + casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); + casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); + casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); uint32_t ntime = swab32(pdata[17]); if ( s_ntime != ntime ) { - hex_getAlgoString( (const uint32_t*) (&endiandata[1]), hashOrder ); + hex_getAlgoString( (const uint32_t*) (&edata[1]), hashOrder ); s_ntime = ntime; if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); + applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + sph_jh512_init( &hex_ctx.jh ); + sph_jh512( &hex_ctx.jh, edata, 64 ); + break; + case SKEIN: + sph_skein512_init( &hex_ctx.skein ); + sph_skein512( &hex_ctx.skein, edata, 64 ); + break; + case CUBEHASH: + cubehashInit( &hex_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &hex_ctx.cube, (const byte*)edata, 64 ); + break; + case HAMSI: + sph_hamsi512_init( &hex_ctx.hamsi ); + sph_hamsi512( &hex_ctx.hamsi, edata, 64 ); + break; + case SHABAL: + sph_shabal512_init( &hex_ctx.shabal ); + sph_shabal512( &hex_ctx.shabal, edata, 64 ); + break; + case WHIRLPOOL: + sph_whirlpool_init( &hex_ctx.whirlpool ); + sph_whirlpool( &hex_ctx.whirlpool, edata, 64 ); + break; + } + do { - be32enc( &endiandata[19], nonce ); - hex_hash( hash32, endiandata ); + edata[19] = nonce; + hex_hash( hash32, edata ); - if ( hash32[7] <= Htarg ) - if (fulltest( hash32, ptarget ) && !opt_benchmark ) + if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) ) { - pdata[19] = nonce; + be32enc( &pdata[19], nonce ); submit_solution( work, hash32, mythr ); } nonce++; - } while ( nonce < max_nonce && !(*restart) ); + } while ( nonce < last_nonce && !(*restart) ); pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } + diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 73f15fd..f31820b 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -17,6 +17,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cube-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/simd-hash-2way.h" @@ -32,11 +33,11 @@ #include "algo/echo/echo-hash-4way.h" #endif +#if defined (X16R_8WAY) + static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; -#if defined (X16R_8WAY) - union _x16r_8way_context_overlay { blake512_8way_context blake; @@ -45,7 +46,8 @@ union _x16r_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; + cubehashParam cube; +// cube_4way_context cube; simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; @@ -65,19 +67,21 @@ union _x16r_8way_context_overlay typedef union _x16r_8way_context_overlay x16r_8way_context_overlay; +static __thread x16r_8way_context_overlay x16r_ctx; void x16r_8way_hash( void* output, const void* input ) { - uint32_t vhash[24*8] __attribute__ ((aligned (128))); - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t hash4[24] __attribute__ ((aligned (64))); - uint32_t hash5[24] __attribute__ ((aligned (64))); - uint32_t hash6[24] __attribute__ ((aligned (64))); - uint32_t hash7[24] __attribute__ ((aligned (64))); + uint32_t vhash[20*8] __attribute__ ((aligned (128))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t hash4[20] __attribute__ ((aligned (64))); + uint32_t hash5[20] __attribute__ ((aligned (64))); + uint32_t hash6[20] __attribute__ ((aligned (64))); + uint32_t hash7[20] __attribute__ ((aligned (64))); 
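+ // Lane buffers are 20 32-bit words (80 bytes) each: enough for the
+ // input block on the first pass and for the 64 byte intermediate
+ // hashes on every later pass.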
x16r_8way_context_overlay ctx; + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -143,28 +147,14 @@ void x16r_8way_hash( void* output, const void* input ) groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 ); #endif break; - case SKEIN: - skein512_8way_init( &ctx.skein ); - if ( i == 0 ) - skein512_8way_update( &ctx.skein, input, size ); - else - { - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - skein512_8way_update( &ctx.skein, vhash, size ); - } - skein512_8way_close( &ctx.skein, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); - break; case JH: - jh512_8way_init( &ctx.jh ); if ( i == 0 ) - jh512_8way_update( &ctx.jh, input, size ); + jh512_8way_update( &ctx.jh, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); + jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, size ); } jh512_8way_close( &ctx.jh, vhash ); @@ -185,21 +175,97 @@ void x16r_8way_hash( void* output, const void* input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; + case SKEIN: + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; case LUFFA: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } + else + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } break; case CUBEHASH: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + 
(const byte*)in3 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash4, + (const byte*)in4 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash5, + (const byte*)in5 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash6, + (const byte*)in6 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash7, + (const byte*)in7 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash4, + (const byte*)in4, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash5, + (const byte*)in5, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash6, + (const byte*)in6, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash7, + (const byte*)in7, size ); + } break; case SHAVITE: #if defined(__VAES__) @@ -272,13 +338,17 @@ void x16r_8way_hash( void* output, const void* input ) #endif break; case HAMSI: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + if ( i == 0 ) + hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, size ); - hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case FUGUE: @@ -309,38 +379,72 @@ void x16r_8way_hash( void* output, const void* input ) break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, size ); + size<<3 ); + if ( i == 0 ) + shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 ); + else + { + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + } shabal512_8way_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in4, size ); - 
sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in5, size ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in6, size ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in7, size ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } break; case SHA_512: sha512_8way_init( &ctx.sha512 ); @@ -355,7 +459,7 @@ void x16r_8way_hash( void* output, const void* input ) sha512_8way_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - break; + break; } size = 64; } @@ -373,23 +477,22 @@ void x16r_8way_hash( void* output, const void* input ) int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[8*16] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t vdata2[20*8] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const 
uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; + const int thr_id = mythr->id; volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm512_bswap32_intrlv80_8x64( vdata, pdata ); + if ( bench ) ptarget[7] = 0x0cff; bedata1[0] = bswap_32( pdata[1] ); bedata1[1] = bswap_32( pdata[2] ); @@ -402,32 +505,84 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + jh512_8way_init( &x16r_ctx.jh ); + jh512_8way_update( &x16r_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_init( &x16r_ctx.skein ); + skein512_8way_update( &x16r_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + intrlv_4x128( vdata2, edata, edata, edata, edata, 640 ); + luffa_4way_init( &x16r_ctx.luffa, 512 ); + luffa_4way_update( &x16r_ctx.luffa, vdata2, 64 ); + rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + hamsi512_8way_init( &x16r_ctx.hamsi ); + hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm256_bswap32_intrlv80_8x32( vdata2, pdata ); + shabal512_8way_init( &x16r_ctx.shabal ); + shabal512_8way_update( &x16r_ctx.shabal, vdata2, 64 ); + rintrlv_8x32_8x64( vdata, vdata2, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16r_ctx.whirlpool ); + sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + default: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + } + + *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( + n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16r_8way_hash( hash, vdata ); - pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n += 8; } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } - #elif defined (X16R_4WAY) +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; + union _x16r_4way_context_overlay { blake512_4way_context blake; @@ -438,6 +593,7 @@ union _x16r_4way_context_overlay 
jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; + hashState_luffa luffa1; cubehashParam cube; sph_shavite512_context shavite; simd_2way_context simd; @@ -449,14 +605,17 @@ union _x16r_4way_context_overlay } __attribute__ ((aligned (64))); typedef union _x16r_4way_context_overlay x16r_4way_context_overlay; +static __thread x16r_4way_context_overlay x16r_ctx; + void x16r_4way_hash( void* output, const void* input ) { - uint32_t vhash[24*4] __attribute__ ((aligned (128))); - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); + uint32_t vhash[20*4] __attribute__ ((aligned (128))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); x16r_4way_context_overlay ctx; + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -500,25 +659,13 @@ void x16r_4way_hash( void* output, const void* input ) groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way_update( &ctx.skein, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way_update( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - break; case JH: - jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way_update( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); @@ -536,27 +683,68 @@ void x16r_4way_hash( void* output, const void* input ) keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; + case SKEIN: + if ( i == 0 ) + skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + skein512_4way_init( &ctx.skein ); + skein512_4way_update( &ctx.skein, vhash, size ); + } + skein512_4way_close( &ctx.skein, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + break; case LUFFA: - intrlv_2x128( vhash, in0, in1, size<<3 ); - luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128_512( hash0, hash1, vhash ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128_512( hash2, hash3, vhash ); + if ( i == 0 ) + { + intrlv_2x128( vhash, in0, in1, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash2, hash3, vhash ); + } + else + { + intrlv_2x128( vhash, in0, in1, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); + } break; case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 
32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + } break; case SHAVITE: sph_shavite512_init( &ctx.shavite ); @@ -591,11 +779,16 @@ void x16r_4way_hash( void* output, const void* input ) (const BitSequence *)in3, size ); break; case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + if ( i == 0 ) + hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_4way_close( &ctx.hamsi, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -613,31 +806,59 @@ void x16r_4way_hash( void* output, const void* input ) break; case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, size ); + if ( i == 0 ) + shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 ); + else + { + shabal512_4way_init( &ctx.shabal ); + shabal512_4way_update( &ctx.shabal, vhash, size ); + } shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, 
hash1 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } break; case SHA_512: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, size ); - sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + sha512_4way_init( &ctx.sha512 ); + if ( i == 0 ) + sha512_4way_update( &ctx.sha512, input, size ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + sha512_4way_init( &ctx.sha512 ); + sha512_4way_update( &ctx.sha512, vhash, size ); + } + sha512_4way_close( &ctx.sha512, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; } size = 64; @@ -651,23 +872,22 @@ void x16r_4way_hash( void* output, const void* input ) int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t hash[16*4] __attribute__ ((aligned (64))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t vdata2[20*4] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; volatile uint8_t *restart = &(work_restart[thr_id].restart); - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); + if ( bench ) ptarget[7] = 0x0cff; bedata1[0] = bswap_32( pdata[1] ); bedata1[1] = bswap_32( pdata[2] ); @@ -680,24 +900,72 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
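+ // hashOrder characters are hex digits: 'A'-'F' select functions 10-15, '0'-'9' select 0-9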
elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + jh512_4way_init( &x16r_ctx.jh ); + jh512_4way_update( &x16r_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_init( &x16r_ctx.skein ); + skein512_4way_update( &x16r_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + intrlv_2x128( vdata2, edata, edata, 640 ); + luffa_2way_init( &x16r_ctx.luffa, 512 ); + luffa_2way_update( &x16r_ctx.luffa, vdata2, 64 ); + rintrlv_2x128_4x64( vdata, vdata2, vdata2, 512 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + hamsi512_4way_init( &x16r_ctx.hamsi ); + hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm128_bswap32_intrlv80_4x32( vdata2, pdata ); + shabal512_4way_init( &x16r_ctx.shabal ); + shabal512_4way_update( &x16r_ctx.shabal, vdata2, 64 ); + rintrlv_4x32_4x64( vdata, vdata2, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16r_ctx.whirlpool ); + sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + default: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + } + + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16r_4way_hash( hash, vdata ); - pdata[19] = n; - for ( int i = 0; i < 4; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 6323589..c438c1e 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -1,5 +1,7 @@ #include "x16r-gate.h" +void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL; + void x16r_getAlgoString( const uint8_t* prevblock, char *output ) { char *sptr = output; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 0457bd5..f86d069 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -50,7 +50,7 @@ enum x16r_Algo { X16R_HASH_FUNC_COUNT }; -void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ); +extern void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ); void x16r_getAlgoString( const uint8_t *prevblock, char *output ); void x16s_getAlgoString( const uint8_t *prevblock, char *output ); void x16rt_getAlgoString( const uint32_t *timeHash, char *output ); diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c index e50dc01..d6da77c 100644 --- a/algo/x16/x16rt-4way.c +++ b/algo/x16/x16rt-4way.c @@ -11,6 +11,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include 
"algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" @@ -26,11 +27,11 @@ #include "algo/echo/echo-hash-4way.h" #endif +#if defined (X16RT_8WAY) + static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; -#if defined (X16RT_8WAY) - union _x16rt_8way_context_overlay { blake512_8way_context blake; @@ -39,7 +40,8 @@ union _x16rt_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; + cubehashParam cube; +// cube_4way_context cube; simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; @@ -59,18 +61,21 @@ union _x16rt_8way_context_overlay typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay; +static __thread x16rt_8way_context_overlay x16rt_ctx; + void x16rt_8way_hash( void* output, const void* input ) { - uint32_t vhash[24*8] __attribute__ ((aligned (128))); - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t hash4[24] __attribute__ ((aligned (64))); - uint32_t hash5[24] __attribute__ ((aligned (64))); - uint32_t hash6[24] __attribute__ ((aligned (64))); - uint32_t hash7[24] __attribute__ ((aligned (64))); + uint32_t vhash[20*8] __attribute__ ((aligned (128))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t hash4[20] __attribute__ ((aligned (64))); + uint32_t hash5[20] __attribute__ ((aligned (64))); + uint32_t hash6[20] __attribute__ ((aligned (64))); + uint32_t hash7[20] __attribute__ ((aligned (64))); x16rt_8way_context_overlay ctx; + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -92,18 +97,16 @@ void x16rt_8way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_8way_init( &ctx.blake ); if ( i == 0 ) - blake512_8way_update( &ctx.blake, input, size ); + blake512_8way_full( &ctx.blake, vhash, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - blake512_8way_update( &ctx.blake, vhash, size ); + blake512_8way_full( &ctx.blake, vhash, vhash, size ); } - blake512_8way_close( &ctx.blake, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7, vhash ); break; case BMW: bmw512_8way_init( &ctx.bmw ); @@ -130,54 +133,24 @@ void x16rt_8way_hash( void* output, const void* input ) groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - 
init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)in4, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)in5, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)in6, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 ); #endif break; - case SKEIN: - skein512_8way_init( &ctx.skein ); - if ( i == 0 ) - skein512_8way_update( &ctx.skein, input, size ); - else - { - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - skein512_8way_update( &ctx.skein, vhash, size ); - } - skein512_8way_close( &ctx.skein, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); - break; case JH: - jh512_8way_init( &ctx.jh ); if ( i == 0 ) - jh512_8way_update( &ctx.jh, input, size ); + jh512_8way_update( &ctx.jh, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); + jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, size ); } jh512_8way_close( &ctx.jh, vhash ); @@ -198,35 +171,105 @@ void x16rt_8way_hash( void* output, const void* input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; + case SKEIN: + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; case LUFFA: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } + else + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } break; case CUBEHASH: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - 
cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)in3 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash4, + (const byte*)in4 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash5, + (const byte*)in5 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash6, + (const byte*)in6 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash7, + (const byte*)in7 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash4, + (const byte*)in4, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash5, + (const byte*)in5, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash6, + (const byte*)in6, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash7, + (const byte*)in7, size ); + } break; case SHAVITE: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size ); + shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size ); + shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else sph_shavite512_init( &ctx.shavite ); @@ -257,61 +300,53 @@ void x16rt_8way_hash( void* output, const void* input ) break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, 
hash7, vhash ); break; case ECHO: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)in4, size ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)in5, size ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)in6, size ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)in7, size ); #endif - break; + break; case HAMSI: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + if ( i == 0 ) + hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, size ); - hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - break; + break; case FUGUE: sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, in0, size ); @@ -340,48 +375,87 @@ void x16rt_8way_hash( void* output, const void* input ) break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, size ); + size<<3 ); + if ( i == 0 ) + shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 ); + else + { + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + } shabal512_8way_close( &ctx.shabal, vhash 
); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in4, size ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in5, size ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in6, size ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in7, size ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } break; case SHA_512: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, size ); + if ( 
i == 0 ) + sha512_8way_update( &ctx.sha512, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + sha512_8way_update( &ctx.sha512, vhash, size ); + } sha512_8way_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); - break; + hash7, vhash ); + break; } size = 64; } @@ -399,23 +473,22 @@ void x16rt_8way_hash( void* output, const void* input ) int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[8*16] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t vdata2[20*8] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t _ALIGN(64) timeHash[8*8]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; + const int thr_id = mythr->id; volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm512_bswap32_intrlv80_8x64( vdata, pdata ); + if ( bench ) ptarget[7] = 0x0cff; uint32_t ntime = bswap_32( pdata[17] ); if ( s_ntime != ntime ) @@ -428,31 +501,84 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, hashOrder, ntime, timeHash ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + jh512_8way_init( &x16rt_ctx.jh ); + jh512_8way_update( &x16rt_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_init( &x16rt_ctx.skein ); + skein512_8way_update( &x16rt_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + intrlv_4x128( vdata2, edata, edata, edata, edata, 640 ); + luffa_4way_init( &x16rt_ctx.luffa, 512 ); + luffa_4way_update( &x16rt_ctx.luffa, vdata2, 64 ); + rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16rt_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16rt_ctx.cube, (const byte*)edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + hamsi512_8way_init( &x16rt_ctx.hamsi ); + hamsi512_8way_update( &x16rt_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm256_bswap32_intrlv80_8x32( vdata2, pdata ); + shabal512_8way_init( &x16rt_ctx.shabal ); + shabal512_8way_update( &x16rt_ctx.shabal, vdata2, 64 ); + rintrlv_8x32_8x64( vdata, vdata2, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16rt_ctx.whirlpool ); + sph_whirlpool( &x16rt_ctx.whirlpool, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + default: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + } + + *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( + n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16rt_8way_hash( hash, vdata ); - pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n += 8; } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } #elif defined (X16RT_4WAY) +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; + union _x16rt_4way_context_overlay { blake512_4way_context blake; @@ -463,6 +589,7 @@ union _x16rt_4way_context_overlay jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; + hashState_luffa luffa1; cubehashParam cube; sph_shavite512_context shavite; simd_2way_context simd; @@ -474,14 +601,17 @@ union _x16rt_4way_context_overlay }; typedef union _x16rt_4way_context_overlay x16rt_4way_context_overlay; +static __thread x16rt_4way_context_overlay x16rt_ctx; + void x16rt_4way_hash( void* output, const void* input ) { - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] 
__attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t vhash[20*4] __attribute__ ((aligned (64))); x16rt_4way_context_overlay ctx; + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -500,15 +630,13 @@ void x16rt_4way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way_update( &ctx.blake, input, size ); + blake512_4way_full( &ctx.blake, vhash, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way_update( &ctx.blake, vhash, size ); + blake512_4way_full( &ctx.blake, vhash, vhash, size ); } - blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case BMW: @@ -524,38 +652,18 @@ void x16rt_4way_hash( void* output, const void* input ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way_update( &ctx.skein, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way_update( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); break; case JH: - jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way_update( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); @@ -573,29 +681,74 @@ void x16rt_4way_hash( void* output, const void* input ) keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; + case SKEIN: + if ( i == 0 ) + skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + skein512_4way_init( &ctx.skein ); + skein512_4way_update( &ctx.skein, vhash, size ); + } + skein512_4way_close( &ctx.skein, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + break; case LUFFA: - intrlv_2x128( vhash, in0, in1, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_2x128( hash2, hash3, vhash, 512 ); + if ( i == 0 ) + { + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash0, + (const BitSequence*)in0 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + 
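+ // each lane resumes from the prehashed luffa midstate saved in x16rt_ctx and absorbs only the final 16 of the 80 header bytes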
update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash1, + (const BitSequence*)in1 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash2, + (const BitSequence*)in2 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash3, + (const BitSequence*)in3 + 64, 16 ); + } + else + { + intrlv_2x128( vhash, in0, in1, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); + } break; case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3 + 64, 16 ); + + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)in3, size ); + } break; case SHAVITE: sph_shavite512_init( &ctx.shavite ); @@ -622,25 +775,26 @@ void x16rt_4way_hash( void* output, const void* input ) dintrlv_2x128( hash2, hash3, vhash, 512 ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); break; case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + if ( i == 0 ) + hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); + else + { + 
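+ // i > 0: lanes hold full 64-byte intermediate hashes, so interleave and run hamsi from a fresh init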
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_4way_close( &ctx.hamsi, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -657,32 +811,59 @@ void x16rt_4way_hash( void* output, const void* input ) sph_fugue512_close( &ctx.fugue, hash3 ); break; case SHABAL: - intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, size ); - shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); + if ( i == 0 ) + shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 ); + else + { + shabal512_4way_init( &ctx.shabal ); + shabal512_4way_update( &ctx.shabal, vhash, size ); + } + shabal512_4way_close( &ctx.shabal, vhash ); + dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - break; + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + break; case SHA_512: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, size ); - sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + sha512_4way_init( &ctx.sha512 ); + if ( i == 0 ) + sha512_4way_update( &ctx.sha512, input, size ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + sha512_4way_update( &ctx.sha512, vhash, size ); + } + sha512_4way_close( &ctx.sha512, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; } size = 64; @@ -698,21 +879,21 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, { uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t vdata32[20*4] __attribute__ ((aligned (64))); + uint32_t 
edata[20] __attribute__ ((aligned (64))); uint32_t _ALIGN(64) timeHash[4*8]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - int thr_id = mythr->id; + const int thr_id = mythr->id; __m256i *noncev = (__m256i*)vdata + 9; // aligned volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0cff; + if ( bench ) ptarget[7] = 0x0cff; - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - uint32_t ntime = bswap_32( pdata[17] ); if ( s_ntime != ntime ) { @@ -724,24 +905,71 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, hashOrder, ntime, timeHash ); } + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + jh512_4way_init( &x16rt_ctx.jh ); + jh512_4way_update( &x16rt_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_init( &x16rt_ctx.skein ); + skein512_4way_update( &x16rt_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + init_luffa( &x16rt_ctx.luffa1, 512 ); + update_luffa( &x16rt_ctx.luffa1, (const BitSequence*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16rt_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16rt_ctx.cube, (const byte*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + hamsi512_4way_init( &x16rt_ctx.hamsi ); + hamsi512_4way_update( &x16rt_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm128_bswap32_intrlv80_4x32( vdata32, pdata ); + shabal512_4way_init( &x16rt_ctx.shabal ); + shabal512_4way_update( &x16rt_ctx.shabal, vdata32, 64 ); + rintrlv_4x32_4x64( vdata, vdata32, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16rt_ctx.whirlpool ); + sph_whirlpool( &x16rt_ctx.whirlpool, edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + default: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + } + + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16rt_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) - if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + for ( int i = 0; i < 4; i++ ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; + } while ( ( n < last_nonce ) && !(*restart) ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index f1f2f08..e5812c4 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -33,11 +33,11 @@ #include "algo/echo/echo-hash-4way.h" #endif +#if defined (X16RV2_8WAY) + static __thread uint32_t s_ntime = 
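// last ntime seen; the hash order string is recomputed only when this changes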
UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; -#if defined (X16RV2_8WAY) - union _x16rv2_8way_context_overlay { blake512_8way_context blake; @@ -46,7 +46,7 @@ union _x16rv2_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; + cubehashParam cube; simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; @@ -66,6 +66,7 @@ union _x16rv2_8way_context_overlay } __attribute__ ((aligned (64))); typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay; +static __thread x16rv2_8way_context_overlay x16rv2_ctx; void x16rv2_8way_hash( void* output, const void* input ) { @@ -79,6 +80,7 @@ void x16rv2_8way_hash( void* output, const void* input ) uint32_t hash6[24] __attribute__ ((aligned (64))); uint32_t hash7[24] __attribute__ ((aligned (64))); x16rv2_8way_context_overlay ctx; + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -102,16 +104,15 @@ void x16rv2_8way_hash( void* output, const void* input ) case BLAKE: blake512_8way_init( &ctx.blake ); if ( i == 0 ) - blake512_8way_update( &ctx.blake, input, size ); + blake512_8way_full( &ctx.blake, vhash, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - blake512_8way_update( &ctx.blake, vhash, size ); + blake512_8way_full( &ctx.blake, vhash, vhash, size ); } - blake512_8way_close( &ctx.blake, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7, vhash ); break; case BMW: bmw512_8way_init( &ctx.bmw ); @@ -130,62 +131,30 @@ void x16rv2_8way_hash( void* output, const void* input ) case GROESTL: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + groestl512_4way_full( &ctx.groestl, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + groestl512_4way_full( &ctx.groestl, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); 
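+ // groestl512_full replaces the old init_groestl/update_and_final_groestl pairs with a single call per lane; the length argument is still a bit count, hence size<<3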
+ groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)in4, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)in5, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)in6, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 ); #endif - break; - case SKEIN: - skein512_8way_init( &ctx.skein ); - if ( i == 0 ) - skein512_8way_update( &ctx.skein, input, size ); - else - { - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - skein512_8way_update( &ctx.skein, vhash, size ); - } - skein512_8way_close( &ctx.skein, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); break; case JH: - jh512_8way_init( &ctx.jh ); if ( i == 0 ) - jh512_8way_update( &ctx.jh, input, size ); + jh512_8way_update( &ctx.jh, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); + jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, size ); } jh512_8way_close( &ctx.jh, vhash ); @@ -193,6 +162,35 @@ void x16rv2_8way_hash( void* output, const void* input ) hash7, vhash ); break; case KECCAK: + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in4 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash4 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in5 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash5 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in6 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash6 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in7 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash7 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + } + else + { sph_tiger_init( &ctx.tiger ); sph_tiger( &ctx.tiger, in0, size ); sph_tiger_close( &ctx.tiger, hash0 ); @@ -217,6 +215,7 @@ void x16rv2_8way_hash( void* output, const void* input ) sph_tiger_init( &ctx.tiger ); sph_tiger( &ctx.tiger, in7, size ); sph_tiger_close( &ctx.tiger, hash7 ); + } for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = @@ -230,64 +229,149 @@ void x16rv2_8way_hash( void* output, const void* input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; + case SKEIN: + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; case LUFFA: - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( 
&ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in4, size ); - sph_tiger_close( &ctx.tiger, hash4 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in5, size ); - sph_tiger_close( &ctx.tiger, hash5 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in6, size ); - sph_tiger_close( &ctx.tiger, hash6 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in7, size ); - sph_tiger_close( &ctx.tiger, hash7 ); + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in4 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash4 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in5 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash5 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in6 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash6 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in7 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash7 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in4, size ); + sph_tiger_close( &ctx.tiger, hash4 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in5, size ); + sph_tiger_close( &ctx.tiger, hash5 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in6, size ); + sph_tiger_close( &ctx.tiger, hash6 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in7, size ); + sph_tiger_close( &ctx.tiger, hash7 ); + } for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0; intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, 64 ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, 64 ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case CUBEHASH: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( 
&ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)in3 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash4, + (const byte*)in4 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash5, + (const byte*)in5 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash6, + (const byte*)in6 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash7, + (const byte*)in7 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash4, + (const byte*)in4, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash5, + (const byte*)in5, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash6, + (const byte*)in6, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash7, + (const byte*)in7, size ); + } break; case SHAVITE: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size ); + shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size ); + shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else sph_shavite512_init( &ctx.shavite ); @@ -315,100 +399,126 @@ void x16rv2_8way_hash( void* output, const void* input ) sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); #endif - break; + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( 
&ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)in4, size ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)in5, size ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)in6, size ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)in7, size ); #endif - break; + break; case HAMSI: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + if ( i == 0 ) + hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, size ); - hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - break; + break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in4, size ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in5, size ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( 
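/* [editor's note] Sketch of the offset arithmetic used by the i == 0
 * midstate branches in these hunks (editorial, not new behaviour). In an
 * interleaved buffer the lanes are woven word by word, so skipping the first
 * 64 bytes of each 80-byte header means skipping 64 * nlanes bytes of the
 * vector buffer:
 *
 *   input + (64<<3)   // 8-way 64-bit lanes: 64 bytes * 8 lanes
 *   input + (64<<2)   // 4-way 64-bit lanes: 64 bytes * 4 lanes
 *   vhash + (16<<3)   // shabal, 8-way 32-bit lanes: vhash is uint32_t*,
 *                     // so 64 bytes = 16 words, times 8 lanes
 *
 * The remaining 16 bytes per lane are then absorbed on top of the resumed
 * midstate.
 */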
&ctx.fugue ); - sph_fugue512( &ctx.fugue, in6, size ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in7, size ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in0, size ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in1, size ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in2, size ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in3, size ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in4, size ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in5, size ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in6, size ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in7, size ); + sph_fugue512_close( &ctx.fugue, hash7 ); break; case SHABAL: - intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, size ); - shabal512_8way_close( &ctx.shabal, vhash ); - dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); + intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + if ( i == 0 ) + shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 ); + else + { + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + } + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); break; case WHIRLPOOL: + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + else + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, in0, size ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); @@ -433,8 +543,38 @@ void x16rv2_8way_hash( void* output, const void* input ) sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, in7, size ); sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } break; case SHA_512: + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 
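/* [editor's note] Context sketch, not part of the patch: x16rv2 differs from
 * x16r in that the KECCAK, LUFFA and SHA_512 stages digest a Tiger hash of
 * the data rather than the data itself. Tiger emits 24 bytes, which are
 * zero-extended to a full 64-byte block, as in the padding loops below:
 *
 *   // pad a 24-byte tiger digest (six uint32_t words) out to 64 bytes
 *   for ( int w = 24/4; w < 64/4; w++ )
 *       hash0[w] = 0;
 *
 * With the i == 0 midstate path, only the final 16 header bytes (in + 64)
 * are pushed through Tiger here; the first 64 bytes were absorbed once in
 * the scanhash prehash.
 */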
+ 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in4 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash4 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in5 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash5 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in6 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash6 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in7 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash7 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + } + else + { sph_tiger_init( &ctx.tiger ); sph_tiger( &ctx.tiger, in0, size ); sph_tiger_close( &ctx.tiger, hash0 ); @@ -459,6 +599,7 @@ void x16rv2_8way_hash( void* output, const void* input ) sph_tiger_init( &ctx.tiger ); sph_tiger( &ctx.tiger, in7, size ); sph_tiger_close( &ctx.tiger, hash7 ); + } for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = @@ -489,21 +630,22 @@ void x16rv2_8way_hash( void* output, const void* input ) int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[8*16] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t vdata2[20*8] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; + const int thr_id = mythr->id; volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0cff; + if ( bench ) ptarget[7] = 0x0cff; mm512_bswap32_intrlv80_8x64( vdata, pdata ); @@ -515,34 +657,89 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder ); s_ntime = ntime; if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); + applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
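/* [editor's note] The ternary this comment interrupts decodes one character
 * of the 16-digit hash-order string: '0'..'9' map to stage indices 0..9 and
 * 'A'..'F' to 10..15, i.e. each hex digit names one of the sixteen hash
 * functions. The same mapping as a plain helper (editor's illustration):
 *
 *   static uint8_t order_digit( char c )
 *   {
 *      return c >= 'A' ? (uint8_t)(c - 'A' + 10)   // 'A'..'F' -> 10..15
 *                      : (uint8_t)(c - '0');       // '0'..'9' -> 0..9
 *   }
 */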
elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + jh512_8way_init( &x16rv2_ctx.jh ); + jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 ); + break; + case KECCAK: + case LUFFA: + case SHA_512: + mm128_bswap32_80( edata, pdata ); + sph_tiger_init( &x16rv2_ctx.tiger ); + sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case SKEIN: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_init( &x16rv2_ctx.skein ); + skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + hamsi512_8way_init( &x16rv2_ctx.hamsi ); + hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm256_bswap32_intrlv80_8x32( vdata2, pdata ); + shabal512_8way_init( &x16rv2_ctx.shabal ); + shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 ); + rintrlv_8x32_8x64( vdata, vdata2, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16rv2_ctx.whirlpool ); + sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + default: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + } + + *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( + n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16rv2_8way_hash( hash, vdata ); - pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n += 8; } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } #elif defined (X16RV2_4WAY) +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; + union _x16rv2_4way_context_overlay { blake512_4way_context blake; @@ -565,6 +762,8 @@ union _x16rv2_4way_context_overlay }; typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay; +static __thread x16rv2_4way_context_overlay x16rv2_ctx; + // Pad the 24 bytes tiger hash to 64 bytes inline void padtiger512( uint32_t* hash ) { @@ -573,12 +772,13 @@ inline void padtiger512( uint32_t* hash ) void x16rv2_4way_hash( void* output, const void* input ) { - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned 
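/* [editor's note] Illustrative summary of the midstate scheme set up by the
 * prehash switch above (an editorial reading of the surrounding hunks): the
 * first 64 bytes of the 80-byte header never change while scanning nonces,
 * so they are absorbed once per work/ntime into a __thread context, and each
 * nonce only finishes the last 16 bytes:
 *
 *   // once per new work, in scanhash:
 *   sph_tiger_init( &thread_ctx.tiger );
 *   sph_tiger( &thread_ctx.tiger, header, 64 );      // 64-byte midstate
 *
 *   // per lane, in the hash function when this stage runs first:
 *   memcpy( &ctx, &thread_ctx, sizeof(ctx) );        // resume midstate
 *   sph_tiger( &ctx.tiger, header + 64, 16 );        // nonce-bearing tail
 *   sph_tiger_close( &ctx.tiger, digest );
 *
 * (header and thread_ctx are illustrative names; the code uses edata and
 * x16rv2_ctx.)
 */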
(64))); + uint32_t vhash[20*4] __attribute__ ((aligned (64))); x16rv2_4way_context_overlay ctx; + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -595,16 +795,14 @@ void x16rv2_4way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way_update( &ctx.blake, input, size ); + blake512_4way_full( &ctx.blake, vhash, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way_update( &ctx.blake, vhash, size ); + blake512_4way_full( &ctx.blake, vhash, vhash, size ); } - blake512_4way_close( &ctx.blake, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case BMW: bmw512_4way_init( &ctx.bmw ); @@ -616,60 +814,56 @@ void x16rv2_4way_hash( void* output, const void* input ) bmw512_4way_update( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way_update( &ctx.skein, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way_update( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); break; case JH: - jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way_update( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case KECCAK: - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, 
&x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + } for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; @@ -679,95 +873,134 @@ void x16rv2_4way_hash( void* output, const void* input ) keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; + case SKEIN: + if ( i == 0 ) + skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + skein512_4way_init( &ctx.skein ); + skein512_4way_update( &ctx.skein, vhash, size ); + } + skein512_4way_close( &ctx.skein, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + break; case LUFFA: - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + } for ( int i = (24/4); i < (64/4); i++ ) - hash0[i] = hash1[i] = 0; + hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; intrlv_2x128( vhash, hash0, hash1, 512 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); dintrlv_2x128( hash0, hash1, vhash, 512 ); - - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - - for ( int i = (24/4); i < (64/4); i++ ) - hash2[i] = hash3[i] = 0; - intrlv_2x128( vhash, hash2, hash3, 512 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); dintrlv_2x128( hash2, hash3, vhash, 512 ); break; case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 
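/* [editor's note] Why cubehash prehashes exactly 64 of the 80 header bytes
 * (sketch matching the calls in this hunk): this cubehash is configured with
 * a 32-byte block, cubehashInit( ctx, 512, 16, 32 ), so 64 bytes are two
 * whole blocks that can be transformed once up front, while the final 16
 * nonce-bearing bytes stay buffered:
 *
 *   cubehashInit( &midstate, 512, 16, 32 );
 *   cubehashUpdate( &midstate, (const byte*)header, 64 );   // 2 full blocks
 *   // ... per lane:
 *   memcpy( &ctx, &midstate, sizeof(ctx) );
 *   cubehashUpdateDigest( &ctx, digest, (const byte*)header + 64, 16 );
 *
 * midstate and header are illustrative names for the x16rv2_ctx.cube and
 * edata used by the code.
 */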
); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + } break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in0, size ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in1, size ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in2, size ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in3, size ); + sph_shavite512_close( &ctx.shavite, hash3 ); break; case SIMD: - intrlv_2x128( vhash, in0, in1, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); + intrlv_2x128( vhash, in0, in1, size<<3 ); + simd512_2way_full( &ctx.simd, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + simd512_2way_full( &ctx.simd, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const 
BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); break; case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + if ( i == 0 ) + hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_4way_close( &ctx.hamsi, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -785,39 +1018,77 @@ void x16rv2_4way_hash( void* output, const void* input ) break; case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, size ); + if ( i == 0 ) + shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 ); + else + { + shabal512_4way_init( &ctx.shabal ); + shabal512_4way_update( &ctx.shabal, vhash, size ); + } shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); + break; + case WHIRLPOOL: + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } break; case SHA_512: - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + 
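/* [editor's note] Sketch of the context-overlay pattern used throughout this
 * file (editorial illustration): all per-stage states share one union, so a
 * single memcpy from the __thread prehash copy restores whichever midstate
 * the first stage needs, without re-running its init/update:
 *
 *   union ctx_overlay { sph_tiger_context tiger; cubehashParam cube;
 *                       sph_whirlpool_context whirlpool; };  // abridged
 *   static __thread union ctx_overlay prehashed;   // filled in scanhash
 *
 *   union ctx_overlay ctx;
 *   memcpy( &ctx, &prehashed, sizeof(ctx) );       // resume lane state
 *
 * Copying the whole union keeps the per-lane restore identical for every
 * algorithm, at the cost of copying more bytes than the active member needs.
 */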
sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + } for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; @@ -841,20 +1112,21 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, { uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t vdata32[20*4] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - int thr_id = mythr->id; + const int thr_id = mythr->id; __m256i *noncev = (__m256i*)vdata + 9; // aligned volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0fff; + if ( bench ) ptarget[7] = 0x0fff; - mm256_bswap32_intrlv80_4x64( vdata, pdata ); bedata1[0] = bswap_32( pdata[1] ); bedata1[1] = bswap_32( pdata[2] ); @@ -867,25 +1139,74 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + jh512_4way_init( &x16rv2_ctx.jh ); + jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 ); + break; + case KECCAK: + case LUFFA: + case SHA_512: + mm128_bswap32_80( edata, pdata ); + sph_tiger_init( &x16rv2_ctx.tiger ); + sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case SKEIN: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_init( &x16rv2_ctx.skein ); + skein512_4way_update( &x16rv2_ctx.skein, vdata, 64 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + hamsi512_4way_init( &x16rv2_ctx.hamsi ); + hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm128_bswap32_intrlv80_4x32( vdata32, pdata ); + shabal512_4way_init( &x16rv2_ctx.shabal ); + shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 ); + rintrlv_4x32_4x64( vdata, vdata32, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16rv2_ctx.whirlpool ); + sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + default: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + } + + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16rv2_4way_hash( hash, vdata ); - pdata[19] = n; - for ( int i = 0; i < 4; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( likely( ( n < max_nonce ) && !(*restart) ) ); - - *hashes_done = n - first_nonce + 1; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 1dc9cee..e84163c 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -17,6 +17,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" @@ -58,7 +59,8 @@ union _x21s_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; + cubehashParam cube; +// cube_4way_context cube; simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; @@ -82,18 +84,21 @@ union _x21s_8way_context_overlay typedef union _x21s_8way_context_overlay x21s_8way_context_overlay; +static __thread x21s_8way_context_overlay x21s_ctx; + void x21s_8way_hash( void* output, const void* input ) { - uint32_t vhash[24*8] __attribute__ ((aligned (128))); - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t 
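/* [editor's note] On the nonce handling rewritten in the scan loops above
 * (editorial sketch of the idea): instead of building and byte-swapping a
 * fresh nonce vector every iteration, the vector is seeded once and then
 * advanced in place, keeping the counter in the header's post-bswap domain;
 * the per-iteration mm256_bswap_32 disappears and the winning nonce is
 * byte-swapped once at submission:
 *
 *   *noncev = mm256_intrlv_blend_32(
 *                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
 *   do {
 *      // hash all lanes, test with valid_hash() ...
 *      // 0x0000000400000000 adds 4 to the high 32-bit half of each
 *      // 64-bit element, i.e. to every lane's nonce word
 *      *noncev = _mm256_add_epi32( *noncev,
 *                                  m256_const1_64( 0x0000000400000000 ) );
 *      n += 4;
 *   } while ( n < last_nonce );
 *   pdata[19] = bswap_32( n + lane );   // restore canonical order on submit
 */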
hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t hash4[24] __attribute__ ((aligned (64))); - uint32_t hash5[24] __attribute__ ((aligned (64))); - uint32_t hash6[24] __attribute__ ((aligned (64))); - uint32_t hash7[24] __attribute__ ((aligned (64))); + uint32_t vhash[20*8] __attribute__ ((aligned (128))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t hash4[20] __attribute__ ((aligned (64))); + uint32_t hash5[20] __attribute__ ((aligned (64))); + uint32_t hash6[20] __attribute__ ((aligned (64))); + uint32_t hash7[20] __attribute__ ((aligned (64))); x21s_8way_context_overlay ctx; + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -115,18 +120,16 @@ void x21s_8way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_8way_init( &ctx.blake ); if ( i == 0 ) - blake512_8way_update( &ctx.blake, input, size ); + blake512_8way_full( &ctx.blake, vhash, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - blake512_8way_update( &ctx.blake, vhash, size ); + blake512_8way_full( &ctx.blake, vhash, vhash, size ); } - blake512_8way_close( &ctx.blake, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7, vhash ); break; case BMW: bmw512_8way_init( &ctx.bmw ); @@ -145,62 +148,30 @@ void x21s_8way_hash( void* output, const void* input ) case GROESTL: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + groestl512_4way_full( &ctx.groestl, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + groestl512_4way_full( &ctx.groestl, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( 
&ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)in4, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)in5, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)in6, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 ); #endif - break; - case SKEIN: - skein512_8way_init( &ctx.skein ); - if ( i == 0 ) - skein512_8way_update( &ctx.skein, input, size ); - else - { - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - skein512_8way_update( &ctx.skein, vhash, size ); - } - skein512_8way_close( &ctx.skein, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); break; case JH: - jh512_8way_init( &ctx.jh ); if ( i == 0 ) - jh512_8way_update( &ctx.jh, input, size ); + jh512_8way_update( &ctx.jh, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); + jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, size ); } jh512_8way_close( &ctx.jh, vhash ); @@ -221,25 +192,97 @@ void x21s_8way_hash( void* output, const void* input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; + case SKEIN: + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; case LUFFA: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } + else + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } break; case CUBEHASH: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1 + 64, 16 ); + 
memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)in3 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash4, + (const byte*)in4 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash5, + (const byte*)in5 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash6, + (const byte*)in6 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash7, + (const byte*)in7 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash4, + (const byte*)in4, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash5, + (const byte*)in5, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash6, + (const byte*)in6, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash7, + (const byte*)in7, size ); + } break; case SHAVITE: #if defined(__VAES__) @@ -277,64 +320,56 @@ void x21s_8way_hash( void* output, const void* input ) sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); #endif - break; + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - init_echo( 
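/* [editor's note] Hedged reading of the cubehash change above, inferred from
 * the cube_4way_context member commented out of the x21s_8way overlay
 * earlier in this file: the overlay now carries the scalar cubehashParam so
 * the 64-byte midstate computed once in scanhash can be memcpy-restored per
 * lane, leaving only the 16 nonce bytes to transform when cubehash runs
 * first in the order:
 *
 *   memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
 *   cubehashUpdateDigest( &ctx.cube, (byte*)out,
 *                         (const byte*)in + 64, 16 );
 *
 * For later positions in the chain the full input is hashed from a fresh
 * cubehashInit, as in the else branch.
 */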
&ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)in4, size ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)in5, size ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)in6, size ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)in7, size ); #endif - break; + break; case HAMSI: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + if ( i == 0 ) + hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, size ); - hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - break; + break; case FUGUE: sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, in0, size ); @@ -363,48 +398,87 @@ void x21s_8way_hash( void* output, const void* input ) break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, size ); + size<<3 ); + if ( i == 0 ) + shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 ); + else + { + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + } shabal512_8way_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in4, size ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in5, size ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in6, size ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - 
sph_whirlpool( &ctx.whirlpool, in7, size ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } break; case SHA_512: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, size ); + if ( i == 0 ) + sha512_8way_update( &ctx.sha512, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + sha512_8way_update( &ctx.sha512, vhash, size ); + } sha512_8way_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); - break; + hash7, vhash ); + break; } size = 64; } @@ -492,8 +566,10 @@ void x21s_8way_hash( void* output, const void* input ) int scanhash_x21s_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[8*16] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t vdata2[20*8] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t *hash7 = &hash[7<<3]; uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); @@ -503,14 +579,12 @@ int scanhash_x21s_8way( struct work *work, 
uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; const uint32_t last_nonce = max_nonce - 16; - int thr_id = mythr->id; + const int thr_id = mythr->id; __m512i *noncev = (__m512i*)vdata + 9; // aligned volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm512_bswap32_intrlv80_8x64( vdata, pdata ); + if ( bench ) ptarget[7] = 0x0cff; bedata1[0] = bswap_32( pdata[1] ); bedata1[1] = bswap_32( pdata[2] ); @@ -523,28 +597,81 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce, applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + jh512_8way_init( &x21s_ctx.jh ); + jh512_8way_update( &x21s_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_init( &x21s_ctx.skein ); + skein512_8way_update( &x21s_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + intrlv_4x128( vdata2, edata, edata, edata, edata, 640 ); + luffa_4way_init( &x21s_ctx.luffa, 512 ); + luffa_4way_update( &x21s_ctx.luffa, vdata2, 64 ); + rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x21s_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x21s_ctx.cube, (const byte*)edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + hamsi512_8way_init( &x21s_ctx.hamsi ); + hamsi512_8way_update( &x21s_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm256_bswap32_intrlv80_8x32( vdata2, pdata ); + shabal512_8way_init( &x21s_ctx.shabal ); + shabal512_8way_update( &x21s_ctx.shabal, vdata2, 64 ); + rintrlv_8x32_8x64( vdata, vdata2, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x21s_ctx.whirlpool ); + sph_whirlpool( &x21s_ctx.whirlpool, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + default: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + } + + *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( + n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + + do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x21s_8way_hash( hash, vdata ); - pdata[19] = n; for ( int lane = 0; lane < 8; lane++ ) if ( unlikely( hash7[lane] <= Htarg ) ) { extr_lane_8x32( lane_hash, hash, lane, 256 ); - if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) { - pdata[19] = n + lane; + pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, lane_hash, mythr, lane ); } } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n += 8; } while ( ( n < last_nonce ) && !(*restart) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -573,6 +700,7 @@ union _x21s_4way_context_overlay jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; + hashState_luffa luffa1; cubehashParam cube; sph_shavite512_context shavite; simd_2way_context 
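/* [editor's note] Side note on the __attribute__ ((aligned (64))) added to
 * this union a few lines below (editorial, hedged): the overlay is memcpy'd
 * from a __thread prehash copy and its members are accessed with 128/256-bit
 * vector loads, so forcing cache-line alignment keeps those accesses on
 * their natural boundaries:
 *
 *   union ctx_overlay { ... } __attribute__ ((aligned (64)));
 *
 * Without the attribute the union would only get the strictest alignment of
 * its members, which may be below a full cache line.
 */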
simd; @@ -589,17 +717,20 @@ union _x21s_4way_context_overlay #else sha256_4way_context sha256; #endif -}; +} __attribute__ ((aligned (64))); typedef union _x21s_4way_context_overlay x21s_4way_context_overlay; +static __thread x21s_4way_context_overlay x21s_ctx; + void x21s_4way_hash( void* output, const void* input ) { - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t vhash[20*4] __attribute__ ((aligned (64))); x21s_4way_context_overlay ctx; + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -624,15 +755,13 @@ void x21s_4way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way_update( &ctx.blake, input, size ); + blake512_4way_full( &ctx.blake, vhash, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way_update( &ctx.blake, vhash, size ); + blake512_4way_full( &ctx.blake, vhash, vhash, size ); } - blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case BMW: @@ -648,38 +777,18 @@ void x21s_4way_hash( void* output, const void* input ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way_update( &ctx.skein, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way_update( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); break; case JH: - jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way_update( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); @@ -697,29 +806,74 @@ void x21s_4way_hash( void* output, const void* input ) keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; + case SKEIN: + if ( i == 0 ) + skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + skein512_4way_init( &ctx.skein ); + skein512_4way_update( 
&ctx.skein, vhash, size ); + } + skein512_4way_close( &ctx.skein, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + break; case LUFFA: - intrlv_2x128( vhash, in0, in1, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_2x128( hash2, hash3, vhash, 512 ); + if ( i == 0 ) + { + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash0, + (const BitSequence*)in0 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash1, + (const BitSequence*)in1 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash2, + (const BitSequence*)in2 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash3, + (const BitSequence*)in3 + 64, 16 ); + } + else + { + intrlv_2x128( vhash, in0, in1, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); + } break; case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3 + 64, 16 ); + + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)in3, size ); + } break; case SHAVITE: sph_shavite512_init( &ctx.shavite ); @@ -746,25 +900,26 @@ void x21s_4way_hash( void* output, const void* input ) dintrlv_2x128( hash2, hash3, vhash, 512 ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); + echo_full( 
&ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); break; case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + if ( i == 0 ) + hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_4way_close( &ctx.hamsi, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -781,32 +936,59 @@ void x21s_4way_hash( void* output, const void* input ) sph_fugue512_close( &ctx.fugue, hash3 ); break; case SHABAL: - intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, size ); - shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); + if ( i == 0 ) + shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 ); + else + { + shabal512_4way_init( &ctx.shabal ); + shabal512_4way_update( &ctx.shabal, vhash, size ); + } + shabal512_4way_close( &ctx.shabal, vhash ); + dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } break; case SHA_512: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, size ); - sha512_4way_close( &ctx.sha512, vhash 
); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + sha512_4way_init( &ctx.sha512 ); + if ( i == 0 ) + sha512_4way_update( &ctx.sha512, input, size ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + sha512_4way_update( &ctx.sha512, vhash, size ); + } + sha512_4way_close( &ctx.sha512, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; } size = 64; @@ -889,23 +1071,23 @@ void x21s_4way_hash( void* output, const void* input ) int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t hash[16*4] __attribute__ ((aligned (64))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t vdata32[20*4] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - int thr_id = mythr->id; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; __m256i *noncev = (__m256i*)vdata + 9; // aligned volatile uint8_t *restart = &(work_restart[thr_id].restart); - if ( opt_benchmark ) - ptarget[7] = 0x0cff; + if ( bench ) ptarget[7] = 0x0cff; - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - bedata1[0] = bswap_32( pdata[1] ); bedata1[1] = bswap_32( pdata[2] ); uint32_t ntime = bswap_32( pdata[17] ); @@ -916,25 +1098,73 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, if ( opt_debug && !thr_id ) applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); } + + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case JH: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + jh512_4way_init( &x21s_ctx.jh ); + jh512_4way_update( &x21s_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_init( &x21s_ctx.skein ); + skein512_4way_update( &x21s_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + init_luffa( &x21s_ctx.luffa1, 512 ); + update_luffa( &x21s_ctx.luffa1, (const BitSequence*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x21s_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x21s_ctx.cube, (const byte*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + hamsi512_4way_init( &x21s_ctx.hamsi ); + hamsi512_4way_update( &x21s_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm128_bswap32_intrlv80_4x32( vdata32, pdata ); + shabal512_4way_init( &x21s_ctx.shabal ); + shabal512_4way_update( &x21s_ctx.shabal, vdata32, 64 ); + rintrlv_4x32_4x64( vdata, vdata32, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x21s_ctx.whirlpool ); + sph_whirlpool( &x21s_ctx.whirlpool, edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + default: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + } + + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x21s_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) - if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + for ( int i = 0; i < 4; i++ ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; + } while ( ( n < last_nonce ) && !(*restart) ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index d2bbdd0..e796321 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -310,10 +310,10 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce, x17_8way_hash( hash, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if unlikely( ( hash7[ lane ] <= Htarg ) && !bench ) + if ( unlikely( ( hash7[ lane ] <= Htarg ) && !bench ) ) { extr_lane_8x32( lane_hash, hash, lane, 256 ); - if likely( valid_hash( lane_hash, ptarget ) ) + if ( likely( valid_hash( lane_hash, ptarget ) ) ) { pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, lane_hash, mythr, lane ); @@ -323,7 +323,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce, m512_const1_64( 0x0000000800000000 ) ); n += 8; } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x17/x17.c b/algo/x17/x17.c index bb29850..95c30a3 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -71,9 +71,7 @@ void x17_hash(void *output, const void *input) sph_bmw512_close(&ctx.bmw, hash); #if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - 
update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + groestl512_full( &ctx.groestl, (char*)hash, (const char*)hash, 512 ); #else sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, hash, 64 ); @@ -92,14 +90,11 @@ void x17_hash(void *output, const void *input) sph_keccak512(&ctx.keccak, (const void*) hash, 64); sph_keccak512_close(&ctx.keccak, hash); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); + luffa_full( &ctx.luffa, (BitSequence*)hash, 512, + (const BitSequence*)hash, 64 ); // 8 Cube - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)hash, 64 ); + cubehash_full( &ctx.cube, (byte*) hash, 512, (const byte*)hash, 64 ); // 9 Shavite sph_shavite512_init( &ctx.shavite ); @@ -107,15 +102,13 @@ void x17_hash(void *output, const void *input) sph_shavite512_close( &ctx.shavite, hash); // 10 Simd - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence*)hash, + simd_full( &ctx.simd, (BitSequence*)hash, (const BitSequence*)hash, 512 ); //11---echo--- #if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence*)hash, - (const BitSequence*)hash, 512 ); + echo_full( &ctx.echo, (BitSequence *)hash, 512, + (const BitSequence *)hash, 64 ); #else sph_echo512_init( &ctx.echo ); sph_echo512( &ctx.echo, hash, 64 ); @@ -161,28 +154,8 @@ int scanhash_x17( struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; int thr_id = mythr->id; // thr_id arg is deprecated - uint64_t htmax[] = - { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = - { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - // we need bigendian data... 
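   // Scalar equivalent of the SSE byte swap below, as a sketch: each of
   // the 20 32-bit words in the 80-byte header is converted to big endian,
   //
   //    for ( int i = 0; i < 20; i++ )
   //       endiandata[i] = bswap_32( pdata[i] );
   //
   // mm128_bswap_32 swaps four words per vector, so the five casti_m128i
   // statements cover the whole header.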
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); @@ -190,23 +163,14 @@ int scanhash_x17( struct work *work, uint32_t max_nonce, casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - for ( int m = 0; m < 6; m++ ) + do { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - pdata[19] = ++n; - be32enc( &endiandata[19], n ); - x17_hash( hash64, endiandata ); - if ( !( hash64[7] & mask ) ) - if ( fulltest( hash64, ptarget ) && !opt_benchmark ) - submit_solution( work, hash64, mythr ); - } while ( n < max_nonce && !work_restart[thr_id].restart); - break; - } - } + pdata[19] = ++n; + be32enc( &endiandata[19], n ); + x17_hash( hash64, endiandata ); + if unlikely( valid_hash( hash64, ptarget ) && !opt_benchmark ) + submit_solution( work, hash64, mythr ); + } while ( n < max_nonce && !work_restart[thr_id].restart); *hashes_done = n - first_nonce + 1; pdata[19] = n; return 0; diff --git a/algo/yespower/yescrypt-r8g.c b/algo/yespower/yescrypt-r8g.c index 5b9e2be..c080763 100644 --- a/algo/yespower/yescrypt-r8g.c +++ b/algo/yespower/yescrypt-r8g.c @@ -73,7 +73,8 @@ bool register_yescryptr8g_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower_r8g; gate->hash = (void*)&yespower_tls; - opt_target_factor = 65536.0; + opt_sapling = true; + opt_target_factor = 65536.0; return true; }; diff --git a/configure b/configure index 3dce50f..3f7e8f2 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.7. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.8. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.11.7' -PACKAGE_STRING='cpuminer-opt 3.11.7' +PACKAGE_VERSION='3.11.8' +PACKAGE_STRING='cpuminer-opt 3.11.8' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.11.7 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.11.8 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.11.7:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.11.8:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.11.7 +cpuminer-opt configure 3.11.8 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.11.7, which was +It was created by cpuminer-opt $as_me 3.11.8, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. 
PACKAGE='cpuminer-opt' - VERSION='3.11.7' + VERSION='3.11.8' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.11.7, which was +This file was extended by cpuminer-opt $as_me 3.11.8, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.11.7 +cpuminer-opt config.status 3.11.8 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 2bf3d8e..5d8771d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.11.7]) +AC_INIT([cpuminer-opt], [3.11.8]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index b6edca2..7e1f094 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -110,6 +110,7 @@ int opt_param_r = 0; int opt_pluck_n = 128; int opt_n_threads = 0; bool opt_reset_on_stale = false; +bool opt_sapling = false; // Windows doesn't support 128 bit affinity mask. // Need compile time and run time test. @@ -551,10 +552,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) goto out; } version = (uint32_t) json_integer_value( tmp ); - if ( version == 5 ) + + // yescryptr8g uses block version 5 and sapling. + if ( opt_sapling ) work->sapling = true; - else if ( version > 4 ) -// if ( (version & 0xffU) > BLOCK_VERSION_CURRENT ) + if ( (version & 0xffU) > BLOCK_VERSION_CURRENT ) { if ( version_reduce ) version = ( version & ~0xffU ) | BLOCK_VERSION_CURRENT; @@ -1057,7 +1059,7 @@ static int share_result( int result, struct work *null_work, } // calculate latency and share time. - if ( my_stats.submit_time.tv_sec ) + if likely( my_stats.submit_time.tv_sec ) { gettimeofday( &ack_time, NULL ); timeval_subtract( &latency_tv, &ack_time, &my_stats.submit_time ); @@ -1075,7 +1077,8 @@ static int share_result( int result, struct work *null_work, if ( likely( result ) ) { accepted_share_count++; - if ( ( my_stats.net_diff > 0. ) && ( my_stats.share_diff >= net_diff ) ) + if unlikely( ( my_stats.net_diff > 0. 
)
+                && ( my_stats.share_diff >= net_diff ) )
      {
         solved = true;
         solved_block_count++;
@@ -1106,17 +1109,14 @@ static int share_result( int result, struct work *null_work,
   }
   else
   {
-      if ( stale )
-         stale_sum++;
-      else
-         reject_sum++;
+      if ( stale ) stale_sum++;
+      else         reject_sum++;
   }
   submit_sum++;
   latency_sum += latency;
   pthread_mutex_unlock( &stats_lock );

-   bcol = acol = scol = rcol = "\0";
   if ( likely( result ) )
   {
      if ( unlikely( solved ) )
@@ -1148,25 +1148,19 @@ static int share_result( int result, struct work *null_work,
      }
   }

-   bcol = acol = scol = rcol = CL_WHT;
-
   if ( use_colors )
   {
+      bcol = acol = scol = rcol = CL_WHT;
      if ( likely( result ) )
      {
-         if ( unlikely( solved ) )
-         {
-            bcol = CL_MAG;
-            acol = CL_GRN;
-         }
-         else
-            acol = CL_GRN;
+         acol = CL_GRN;
+         if ( unlikely( solved ) ) bcol = CL_MAG;
      }
-      else if ( stale )
-         scol = CL_YL2;
-      else
-         rcol = CL_RED;
+      else if ( stale ) scol = CL_YL2;
+      else rcol = CL_RED;
   }
+   else
+      bcol = acol = scol = rcol = "\0";

   applog( LOG_NOTICE, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)",
           my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
@@ -1180,31 +1174,29 @@ static int share_result( int result, struct work *null_work,
   if ( unlikely( reason && !result ) )
   {
      if ( !( opt_quiet || stale ) )
+      {
         applog( LOG_WARNING, "Reject reason: %s", reason );
-      if ( opt_debug )
-      {
         uint32_t str1[8], str2[8];
         char str3[65];

         // display share hash and target for troubleshooting
-         diff_to_target( (uint64_t*)str1, my_stats.share_diff );
+         diff_to_target( str1, my_stats.share_diff );
         for ( int i = 0; i < 8; i++ )
            be32enc( str2 + i, str1[7 - i] );
         bin2hex( str3, (unsigned char*)str2, 12 );
-         applog2( LOG_INFO, "Hash: %s...", str3 );
+         applog2( LOG_INFO, "Share diff: %g, Hash: %s...", my_stats.share_diff, str3 );

-         diff_to_target( (uint64_t*)str1, my_stats.target_diff );
+         diff_to_target( str1, my_stats.target_diff );
         for ( int i = 0; i < 8; i++ )
            be32enc( str2 + i, str1[7 - i] );
         bin2hex( str3, (unsigned char*)str2, 12 );
-         applog2( LOG_INFO, "Target: %s...", str3 );
+         applog2( LOG_INFO, "Target diff: %g, Targ: %s...", my_stats.target_diff, str3 );
      }
      if ( unlikely( opt_reset_on_stale && stale ) )
         stratum_need_reset = true;
   }
-
   return 1;
}

@@ -1265,7 +1257,7 @@ bool std_le_submit_getwork_result( CURL *curl, struct work *work )
   for ( int i = 0; i < data_size / sizeof(uint32_t); i++ )
      le32enc( &work->data[i], work->data[i] );
   gw_str = abin2hex( (uchar*)work->data, data_size );
-   if ( unlikely(!gw_str) )
+   if ( unlikely( !gw_str ) )
   {
      applog(LOG_ERR, "submit_upstream_work OOM");
      return false;
@@ -1299,7 +1291,7 @@ bool std_be_submit_getwork_result( CURL *curl, struct work *work )
   for ( int i = 0; i < data_size / sizeof(uint32_t); i++ )
      be32enc( &work->data[i], work->data[i] );
   gw_str = abin2hex( (uchar*)work->data, data_size );
-   if ( unlikely(!gw_str) )
+   if ( unlikely( !gw_str ) )
   {
      applog(LOG_ERR, "submit_upstream_work OOM");
      return false;
@@ -1755,7 +1747,7 @@ static bool get_work(struct thr_info *thr, struct work *work)
   struct workio_cmd *wc;
   struct work *work_heap;

-   if (opt_benchmark)
+   if unlikely( opt_benchmark )
   {
      uint32_t ts = (uint32_t) time(NULL);

@@ -2020,8 +2012,8 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
      work_free( work );
      work_copy( work, g_work );
      *nonceptr = 0xffffffffU / opt_n_threads * thr_id;
-      if ( opt_randomize )
-         *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
+//    if ( opt_randomize )
+//       *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
      *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
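      // Worked example of the nonce-range partitioning above: thread t
      // of N scans [ 0xffffffffU / N * t, 0xffffffffU / N * (t+1) - 0x20 ),
      // so with 4 threads, thread 1 starts at 0x3fffffff and stops at
      // 0x7fffffde; the 0x20 gap leaves headroom below the next thread's
      // range.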
   }
   else
@@ -2214,7 +2206,7 @@ static void *miner_thread( void *userdata )
               continue;
            }
            // adjust max_nonce to meet target scan time
-            if (have_stratum)
+            if ( have_stratum )
               max64 = LP_SCANTIME;
            else
               max64 = g_work_time + ( have_longpoll ? LP_SCANTIME : opt_scantime )
@@ -2294,7 +2286,7 @@ static void *miner_thread( void *userdata )
         // prevent stale work in solo
         // we can't submit a block twice!
-         if ( !have_stratum && !have_longpoll )
+         if unlikely( !have_stratum && !have_longpoll )
         {
            pthread_mutex_lock( &g_work_lock );
            // will force getwork
@@ -2598,20 +2590,17 @@ void std_build_block_header( struct work* g_work, uint32_t version,
   memset( g_work->data, 0, sizeof(g_work->data) );
   g_work->data[0] = version;
-   g_work->sapling = be32dec( &version ) == 5 ? true : false;
+   g_work->sapling = opt_sapling;

-   if ( have_stratum )
-      for ( i = 0; i < 8; i++ )
+   if ( have_stratum ) for ( i = 0; i < 8; i++ )
      g_work->data[ 1+i ] = le32dec( prevhash + i );
-   else
-      for (i = 0; i < 8; i++)
+   else for (i = 0; i < 8; i++)
      g_work->data[ 8-i ] = le32dec( prevhash + i );
-
   for ( i = 0; i < 8; i++ )
      g_work->data[ 9+i ] = be32dec( merkle_tree + i );
-
   g_work->data[ algo_gate.ntime_index ] = ntime;
   g_work->data[ algo_gate.nbits_index ] = nbits;
+
   if ( g_work->sapling )
   {
      if ( have_stratum )
@@ -2653,7 +2642,6 @@ void std_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
{
   pthread_mutex_lock( &sctx->work_lock );
-
   free( g_work->job_id );
   g_work->job_id = strdup( sctx->job.job_id );
   g_work->xnonce2_len = sctx->xnonce2_size;
@@ -2690,7 +2678,7 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
      else if ( last_block_height != sctx->block_height )
         applog( LOG_BLUE, "New block %d, job %s", sctx->block_height,
                 g_work->job_id );
-      else
+      else if ( g_work->job_id )
         applog( LOG_BLUE, "New job %s", g_work->job_id );

      // Update data and calculate new estimates.
@@ -2710,6 +2698,7 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
      applog2( LOG_INFO, "%s: %s", algo_names[opt_algo], short_url );
      applog2( LOG_INFO, "Diff: Net %.3g, Stratum %.3g, Target %.3g",
               net_diff, stratum_diff, last_targetdiff );
+
      if ( likely( hr > 0. ) )
      {
         char hr_units[4] = {0};
@@ -2719,26 +2708,25 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
         sprintf_et( block_ttf, net_diff * diff_to_hash / hr );
         sprintf_et( share_ttf, last_targetdiff * diff_to_hash / hr );
         scale_hash_for_display ( &hr, hr_units );
-
         applog2( LOG_INFO, "TTF @ %.2f %sh/s: block %s, share %s",
                  hr, hr_units, block_ttf, share_ttf );
-         if ( !multipool && net_diff > 0. )
+
+         if ( !multipool && last_block_height > session_first_block )
         {
            struct timeval now, et;
            gettimeofday( &now, NULL );
            timeval_subtract( &et, &now, &session_start );
-            double net_hr = net_diff * diff_to_hash;
-            char net_ttf[32];
+            uint64_t net_ttf =
+               ( last_block_height - session_first_block ) == 0 ? 0
+               : et.tv_sec / ( last_block_height - session_first_block );
+            double net_hr = net_diff * diff_to_hash / net_ttf;
+            char net_ttf_str[32];
            char net_hr_units[4] = {0};
-            sprintf_et( net_ttf,
-               ( last_block_height - session_first_block ) == 0 ? 
0 : - et.tv_sec / ( last_block_height - session_first_block ) ); - + sprintf_et( net_ttf_str, net_ttf ); scale_hash_for_display ( &net_hr, net_hr_units ); - - applog2( LOG_INFO, "TTF @ %.2f %sh/s: %s", - net_hr, net_hr_units, net_ttf ); + applog2( LOG_INFO, "Net TTF @ %.2f %sh/s: %s", + net_hr, net_hr_units, net_ttf_str ); } } // hr > 0 } // !quiet diff --git a/miner.h b/miner.h index dbb3006..9629ee3 100644 --- a/miner.h +++ b/miner.h @@ -317,7 +317,7 @@ bool valid_hash( const void*, const void* ); void work_set_target( struct work* work, double diff ); double target_to_diff( uint32_t* target ); -extern void diff_to_target( uint64_t *target, double diff ); +extern void diff_to_target( uint32_t *target, double diff ); double hash_target_ratio( uint32_t* hash, uint32_t* target ); void work_set_target_ratio( struct work* work, const void *hash ); @@ -779,7 +779,7 @@ extern pthread_mutex_t rpc2_job_lock; extern pthread_mutex_t rpc2_login_lock; extern pthread_mutex_t applog_lock; extern pthread_mutex_t stats_lock; - +extern bool opt_sapling; static char const usage[] = "\ Usage: " PACKAGE_NAME " [OPTIONS]\n\ diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 4ad8df4..0ca4f95 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -567,6 +567,20 @@ static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1, } } +#if defined(__SSSE3__) + +static inline void mm128_bswap32_80( void *d, void *s ) +{ + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf ); + casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), bswap_shuf ); + casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), bswap_shuf ); + casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), bswap_shuf ); + casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf ); +} + +#endif + static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) { __m128i s0 = casti_m128i( src,0 ); @@ -2106,6 +2120,7 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src, RLEAVE_4x64_4x32( 48 ); RLEAVE_4x64_4x32( 56 ); if ( bit_len <= 512 ) return; RLEAVE_4x64_4x32( 64 ); RLEAVE_4x64_4x32( 72 ); + if ( bit_len <= 640 ) return; RLEAVE_4x64_4x32( 80 ); RLEAVE_4x64_4x32( 88 ); RLEAVE_4x64_4x32( 96 ); RLEAVE_4x64_4x32( 104 ); RLEAVE_4x64_4x32( 112 ); RLEAVE_4x64_4x32( 120 ); @@ -2140,6 +2155,9 @@ static inline void rintrlv_8x64_8x32( void *dst, const void *src, if ( bit_len <= 512 ) return; RLEAVE_8x64_8x32( 128 ); RLEAVE_8x64_8x32( 144 ); + + if ( bit_len <= 640 ) return; + RLEAVE_8x64_8x32( 160 ); RLEAVE_8x64_8x32( 176 ); RLEAVE_8x64_8x32( 192 ); RLEAVE_8x64_8x32( 208 ); RLEAVE_8x64_8x32( 224 ); RLEAVE_8x64_8x32( 240 ); @@ -2255,6 +2273,8 @@ static inline void rintrlv_8x32_8x64( void *dst, d[38] = _mm_unpacklo_epi32( s[37], s[39] ); d[39] = _mm_unpackhi_epi32( s[37], s[39] ); + if ( bit_len <= 640 ) return; + d[40] = _mm_unpacklo_epi32( s[40], s[42] ); d[41] = _mm_unpackhi_epi32( s[40], s[42] ); d[42] = _mm_unpacklo_epi32( s[41], s[43] ); @@ -2319,7 +2339,9 @@ static inline void rintrlv_8x32_4x128( void *dst0, void *dst1, if ( bit_len <= 256 ) return; RLEAVE_8X32_4X128( 32 ); RLEAVE_8X32_4X128( 48 ); if ( bit_len <= 512 ) return; - RLEAVE_8X32_4X128( 64 ); RLEAVE_8X32_4X128( 80 ); + RLEAVE_8X32_4X128( 64 ); + if ( bit_len <= 640 ) return; + RLEAVE_8X32_4X128( 80 ); RLEAVE_8X32_4X128( 96 ); RLEAVE_8X32_4X128( 112 ); } #undef RLEAVE_8X32_4X128 @@ -2383,6 +2405,7 @@ static inline 
void rintrlv_2x128_4x64( void *dst, const void *src0, d[17] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] ); d[18] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] ); d[19] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] ); + if ( bit_len <= 640 ) return; d[20] = _mm_unpacklo_epi64( s0[10], s0[11] ); d[21] = _mm_unpacklo_epi64( s1[10], s1[11] ); d[22] = _mm_unpackhi_epi64( s0[10], s0[11] ); @@ -2453,6 +2476,7 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, d0[ 9] = _mm_unpackhi_epi64( s[16], s[18] ); d1[ 8] = _mm_unpacklo_epi64( s[17], s[19] ); d1[ 9] = _mm_unpackhi_epi64( s[17], s[19] ); + if ( bit_len <= 640 ) return; d0[10] = _mm_unpacklo_epi64( s[20], s[22] ); d0[11] = _mm_unpackhi_epi64( s[20], s[22] ); d1[10] = _mm_unpacklo_epi64( s[21], s[23] ); @@ -2549,6 +2573,8 @@ static inline void rintrlv_4x128_8x64( void *dst, const void *src0, d[38] = _mm_unpackhi_epi64( s1[16], s1[17] ); d[39] = _mm_unpackhi_epi64( s1[18], s1[19] ); + if ( bit_len <= 640 ) return; + d[40] = _mm_unpacklo_epi64( s0[20], s0[21] ); d[41] = _mm_unpacklo_epi64( s0[22], s0[23] ); d[42] = _mm_unpacklo_epi64( s1[20], s1[21] ); @@ -2635,6 +2661,8 @@ static inline void rintrlv_8x64_4x128( void *dst0, void *dst1, d1[18] = _mm_unpacklo_epi64( s[35], s[39] ); d1[19] = _mm_unpackhi_epi64( s[35], s[39] ); + if ( bit_len <= 640 ) return; + d0[20] = _mm_unpacklo_epi64( s[40], s[44] ); d0[21] = _mm_unpackhi_epi64( s[40], s[44] ); d1[20] = _mm_unpacklo_epi64( s[42], s[46] ); @@ -2723,6 +2751,8 @@ static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2, d2[ 9] = _mm_unpacklo_epi64( s[35], s[39] ); d3[ 9] = _mm_unpackhi_epi64( s[35], s[39] ); + if ( bit_len <= 640 ) return; + d0[10] = _mm_unpacklo_epi64( s[40], s[44] ); d1[10] = _mm_unpackhi_epi64( s[40], s[44] ); d2[10] = _mm_unpacklo_epi64( s[41], s[45] ); @@ -2811,6 +2841,8 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0, d[38] = _mm_unpackhi_epi64( s2[8], s2[10] ); d[39] = _mm_unpackhi_epi64( s3[8], s3[10] ); + if ( bit_len <= 640 ) return; + d[40] = _mm_unpacklo_epi64( s0[9], s0[11] ); d[41] = _mm_unpacklo_epi64( s1[9], s1[11] ); d[42] = _mm_unpacklo_epi64( s2[9], s2[11] ); diff --git a/util.c b/util.c index d701389..635be39 100644 --- a/util.c +++ b/util.c @@ -1038,7 +1038,7 @@ bool fulltest( const uint32_t *hash, const uint32_t *target ) return rc; } -void diff_to_target(uint64_t *target, double diff) +void diff_to_target(uint32_t *target, double diff) { uint64_t m; int k; @@ -1055,7 +1055,7 @@ void diff_to_target(uint64_t *target, double diff) else { memset( target, 0, 32 ); - target[k] = m; + ((uint64_t*)target)[k] = m; // target[k] = (uint32_t)m; // target[k + 1] = (uint32_t)(m >> 32); } @@ -1064,7 +1064,7 @@ void diff_to_target(uint64_t *target, double diff) // Only used by stratum pools void work_set_target(struct work* work, double diff) { - diff_to_target( (uint64_t*)work->target, diff ); + diff_to_target( work->target, diff ); work->targetdiff = diff; } @@ -1574,8 +1574,9 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p goto out; if (!socket_full(sctx->sock, 3)) { - if (opt_debug) - applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); + applog(LOG_WARNING, "stratum extranonce subscribe timed out"); +// if (opt_debug) +// applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); goto out; } @@ -1590,7 +1591,7 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p if (!stratum_handle_method(sctx, sret)) applog(LOG_WARNING, "Stratum answer id is not correct!"); } -// 
res_val = json_object_get(extra, "result"); + res_val = json_object_get(extra, "result"); // if (opt_debug && (!res_val || json_is_false(res_val))) // applog(LOG_DEBUG, "extranonce subscribe not supported"); json_decref(extra); @@ -1898,13 +1899,13 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) } hex2bin( sctx->job.version, version, 4 ); - int ver = be32dec( sctx->job.version ); - if ( ver == 5 ) + + if ( opt_sapling ) { finalsaplinghash = json_string_value( json_array_get( params, 9 ) ); if ( !finalsaplinghash || strlen(finalsaplinghash) != 64 ) { - applog( LOG_ERR, "Stratum notify: invalid version 5 parameters" ); + applog( LOG_ERR, "Stratum notify: invalid sapling parameters" ); goto out; } } @@ -1957,7 +1958,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) hex2bin( sctx->job.prevhash, prevhash, 32 ); if ( has_claim ) hex2bin( sctx->job.extra, extradata, 32 ); if ( has_roots ) hex2bin( sctx->job.extra, extradata, 64 ); - if ( ver == 5 ) + if ( opt_sapling ) hex2bin( sctx->job.final_sapling_hash, finalsaplinghash, 32 ); if ( is_veil )