v3.11.8

2025-09-17 23:44:27 +00:00 · 2020-01-30 03:47:11 -05:00
parent 88f81fda0b
commit 0681ca996d
46 changed files with 2882 additions and 10675 deletions
--- a/algo/groestl/aes_ni/groestl-asm-aes.h
+++ b/algo/groestl/aes_ni/groestl-asm-aes.h
--- a/algo/groestl/aes_ni/groestl-asm-avx.h
+++ b/algo/groestl/aes_ni/groestl-asm-avx.h
--- a/algo/groestl/aes_ni/groestl-asm-vperm.h
+++ b/algo/groestl/aes_ni/groestl-asm-vperm.h
--- a/algo/groestl/aes_ni/groestl-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl-intr-aes.h
@@ -11,17 +11,6 @@
 #include <wmmintrin.h>
 #include "hash-groestl.h"

-/* global constants  */
-__m128i ROUND_CONST_Lx;
-//__m128i ROUND_CONST_L0[ROUNDS512];
-//__m128i ROUND_CONST_L7[ROUNDS512];
-__m128i ROUND_CONST_P[ROUNDS1024];
-__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_1B;
-__m128i ALL_FF;
-
 #define tos(a)    #a
 #define tostr(a)  tos(a)

@@ -111,7 +100,7 @@ __m128i ALL_FF;
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = ALL_1B;\
+  b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
  MUL2(a0, b0, b1);\
  a0 = _mm_xor_si128(a0, TEMP0);\
  MUL2(a1, b0, b1);\
@@ -152,24 +141,41 @@ __m128i ALL_FF;
 }/*MixBytes*/


-#define SET_CONSTANTS(){\
-  ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
-  ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
-  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
-  SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\
-  SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\
-  SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\
-  SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\
-  SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\
-  SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\
-  SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\
-  SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\
-  for(i = 0; i < ROUNDS1024; i++)\
-  {\
-    ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
-    ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
-  }\
-}while(0);\
+static const uint64_t round_const_p[] __attribute__ ((aligned (64))) =
+{
+  0x7060504030201000, 0xf0e0d0c0b0a09080,
+  0x7161514131211101, 0xf1e1d1c1b1a19181,
+  0x7262524232221202, 0xf2e2d2c2b2a29282,
+  0x7363534333231303, 0xf3e3d3c3b3a39383,
+  0x7464544434241404, 0xf4e4d4c4b4a49484,
+  0x7565554535251505, 0xf5e5d5c5b5a59585,
+  0x7666564636261606, 0xf6e6d6c6b6a69686,
+  0x7767574737271707, 0xf7e7d7c7b7a79787,
+  0x7868584838281808, 0xf8e8d8c8b8a89888,
+  0x7969594939291909, 0xf9e9d9c9b9a99989,
+  0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a,
+  0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b,
+  0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c,
+  0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d
+};
+
+static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
+{
+  0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f,
+  0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e,
+  0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d,
+  0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c,
+  0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b,
+  0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a,
+  0x8999a9b9c9d9e9f9, 0x0919293949596979,
+  0x8898a8b8c8d8e8f8, 0x0818283848586878,
+  0x8797a7b7c7d7e7f7, 0x0717273747576777,
+  0x8696a6b6c6d6e6f6, 0x0616263646566676,
+  0x8595a5b5c5d5e5f5, 0x0515253545556575,
+  0x8494a4b4c4d4e4f4, 0x0414243444546474,
+  0x8393a3b3c3d3e3f3, 0x0313233343536373,
+  0x8292a2b2c2d2e2f2, 0x0212223242526272
+};

 /* one round
 * a0-a7 = input rows
@@ -194,30 +200,50 @@ __m128i ALL_FF;
  u8 round_counter = 0;\
  for(round_counter = 0; round_counter < 14; round_counter+=2) {\
    /* AddRoundConstant P1024 */\
-    xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
+    xmm8 = _mm_xor_si128( xmm8, \
+             casti_m128i( round_const_p, round_counter ) ); \
     /* ShiftBytes P1024 + pre-AESENCLAST */\
-    xmm8  = _mm_shuffle_epi8(xmm8,  (SUBSH_MASK[0]));\
-    xmm9  = _mm_shuffle_epi8(xmm9,  (SUBSH_MASK[1]));\
-    xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
-    xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\
-    xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\
-    xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\
-    xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\
-    xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\
+    xmm8  = _mm_shuffle_epi8( xmm8,  m128_const_64( 0x0306090c0f020508, \
+                                                    0x0b0e0104070a0d00 ) ); \
+    xmm9  = _mm_shuffle_epi8( xmm9,  m128_const_64( 0x04070a0d00030609, \
+                                                    0x0c0f0205080b0e01 ) ); \
+    xmm10 = _mm_shuffle_epi8( xmm10, m128_const_64( 0x05080b0e0104070a, \
+                                                    0x0d000306090c0f02 ) ); \
+    xmm11 = _mm_shuffle_epi8( xmm11, m128_const_64( 0x06090c0f0205080b, \
+                                                    0x0e0104070a0d0003 ) ); \
+    xmm12 = _mm_shuffle_epi8( xmm12, m128_const_64( 0x070a0d000306090c, \
+                                                    0x0f0205080b0e0104 ) ); \
+    xmm13 = _mm_shuffle_epi8( xmm13, m128_const_64( 0x080b0e0104070a0d, \
+                                                    0x000306090c0f0205 ) ); \
+    xmm14 = _mm_shuffle_epi8( xmm14, m128_const_64( 0x090c0f0205080b0e, \
+                                                    0x0104070a0d000306 ) ); \
+    xmm15 = _mm_shuffle_epi8( xmm15, m128_const_64( 0x0e0104070a0d0003, \
+                                                    0x06090c0f0205080b ) ); \
    /* SubBytes + MixBytes */\
-    SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
+    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
+            xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7 ); \
    \
    /* AddRoundConstant P1024 */\
-    xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
-    xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
-    xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
-    xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
-    xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\
-    xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\
-    xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
-    xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
-    xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
-    SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
+    xmm0 = _mm_xor_si128( xmm0, \
+             casti_m128i( round_const_p, round_counter+1 ) ); \
+    xmm0 = _mm_shuffle_epi8( xmm0, m128_const_64( 0x0306090c0f020508, \
+                                                  0x0b0e0104070a0d00 ) ); \
+    xmm1 = _mm_shuffle_epi8( xmm1, m128_const_64( 0x04070a0d00030609, \
+                                                  0x0c0f0205080b0e01 ) ); \
+    xmm2 = _mm_shuffle_epi8( xmm2, m128_const_64( 0x05080b0e0104070a, \
+                                                  0x0d000306090c0f02 ) ); \
+    xmm3 = _mm_shuffle_epi8( xmm3, m128_const_64( 0x06090c0f0205080b, \
+                                                  0x0e0104070a0d0003 ) ); \
+    xmm4 = _mm_shuffle_epi8( xmm4, m128_const_64( 0x070a0d000306090c, \
+                                                  0x0f0205080b0e0104 ) ); \
+    xmm5 = _mm_shuffle_epi8( xmm5, m128_const_64( 0x080b0e0104070a0d, \
+                                                  0x000306090c0f0205 ) ); \
+    xmm6 = _mm_shuffle_epi8( xmm6, m128_const_64( 0x090c0f0205080b0e, \
+                                                  0x0104070a0d000306 ) ); \
+    xmm7 = _mm_shuffle_epi8( xmm7, m128_const_64( 0x0e0104070a0d0003, \
+                                                  0x06090c0f0205080b ) ); \
+    SUBMIX( xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7, \
+            xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
  }\
 }

@@ -225,48 +251,68 @@ __m128i ALL_FF;
  u8 round_counter = 0;\
  for(round_counter = 0; round_counter < 14; round_counter+=2) {\
    /* AddRoundConstant Q1024 */\
-    xmm1 = ALL_FF;\
-    xmm8  = _mm_xor_si128(xmm8,  xmm1);\
-    xmm9  = _mm_xor_si128(xmm9,  xmm1);\
-    xmm10 = _mm_xor_si128(xmm10, xmm1);\
-    xmm11 = _mm_xor_si128(xmm11, xmm1);\
-    xmm12 = _mm_xor_si128(xmm12, xmm1);\
-    xmm13 = _mm_xor_si128(xmm13, xmm1);\
-    xmm14 = _mm_xor_si128(xmm14, xmm1);\
-    xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\
+    xmm1 = m128_neg1;\
+    xmm8  = _mm_xor_si128( xmm8,  xmm1 ); \
+    xmm9  = _mm_xor_si128( xmm9,  xmm1 ); \
+    xmm10 = _mm_xor_si128( xmm10, xmm1 ); \
+    xmm11 = _mm_xor_si128( xmm11, xmm1 ); \
+    xmm12 = _mm_xor_si128( xmm12, xmm1 ); \
+    xmm13 = _mm_xor_si128( xmm13, xmm1 ); \
+    xmm14 = _mm_xor_si128( xmm14, xmm1 ); \
+    xmm15 = _mm_xor_si128( xmm15, \
+              casti_m128i( round_const_q, round_counter ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
-    xmm8  = _mm_shuffle_epi8(xmm8,  (SUBSH_MASK[1]));\
-    xmm9  = _mm_shuffle_epi8(xmm9,  (SUBSH_MASK[3]));\
-    xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\
-    xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\
-    xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\
-    xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\
-    xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\
-    xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\
+    xmm8  = _mm_shuffle_epi8( xmm8,  m128_const_64( 0x04070a0d00030609, \
+                                                    0x0c0f0205080b0e01 ) ); \
+    xmm9  = _mm_shuffle_epi8( xmm9,  m128_const_64( 0x06090c0f0205080b, \
+                                                    0x0e0104070a0d0003 ) ); \
+    xmm10 = _mm_shuffle_epi8( xmm10, m128_const_64( 0x080b0e0104070a0d, \
+                                                    0x000306090c0f0205 ) ); \
+    xmm11 = _mm_shuffle_epi8( xmm11, m128_const_64( 0x0e0104070a0d0003, \
+                                                    0x06090c0f0205080b ) ); \
+    xmm12 = _mm_shuffle_epi8( xmm12, m128_const_64( 0x0306090c0f020508, \
+                                                    0x0b0e0104070a0d00 ) ); \
+    xmm13 = _mm_shuffle_epi8( xmm13, m128_const_64( 0x05080b0e0104070a, \
+                                                    0x0d000306090c0f02 ) ); \
+    xmm14 = _mm_shuffle_epi8( xmm14, m128_const_64( 0x070a0d000306090c, \
+                                                    0x0f0205080b0e0104 ) ); \
+    xmm15 = _mm_shuffle_epi8( xmm15, m128_const_64( 0x090c0f0205080b0e, \
+                                                    0x0104070a0d000306 ) ); \
    /* SubBytes + MixBytes */\
-    SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
+    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
+            xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6 , xmm7 ); \
    \
    /* AddRoundConstant Q1024 */\
-    xmm9 = ALL_FF;\
-    xmm0 = _mm_xor_si128(xmm0,  xmm9);\
-    xmm1 = _mm_xor_si128(xmm1,  xmm9);\
-    xmm2 = _mm_xor_si128(xmm2,  xmm9);\
-    xmm3 = _mm_xor_si128(xmm3,  xmm9);\
-    xmm4 = _mm_xor_si128(xmm4,  xmm9);\
-    xmm5 = _mm_xor_si128(xmm5,  xmm9);\
-    xmm6 = _mm_xor_si128(xmm6,  xmm9);\
-    xmm7 = _mm_xor_si128(xmm7,  (ROUND_CONST_Q[round_counter+1]));\
+    xmm9 = m128_neg1;\
+    xmm0 = _mm_xor_si128( xmm0, xmm9 ); \
+    xmm1 = _mm_xor_si128( xmm1, xmm9 ); \
+    xmm2 = _mm_xor_si128( xmm2, xmm9 ); \
+    xmm3 = _mm_xor_si128( xmm3, xmm9 ); \
+    xmm4 = _mm_xor_si128( xmm4, xmm9 ); \
+    xmm5 = _mm_xor_si128( xmm5, xmm9 ); \
+    xmm6 = _mm_xor_si128( xmm6, xmm9 ); \
+    xmm7 = _mm_xor_si128( xmm7, \
+             casti_m128i( round_const_q, round_counter+1 ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
-    xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\
-    xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\
-    xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\
-    xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\
-    xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\
-    xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\
-    xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\
-    xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\
+    xmm0 = _mm_shuffle_epi8( xmm0, m128_const_64( 0x04070a0d00030609, \
+                                                  0x0c0f0205080b0e01 ) ); \
+    xmm1 = _mm_shuffle_epi8( xmm1, m128_const_64( 0x06090c0f0205080b, \
+                                                  0x0e0104070a0d0003 ) ); \
+    xmm2 = _mm_shuffle_epi8( xmm2, m128_const_64( 0x080b0e0104070a0d, \
+                                                  0x000306090c0f0205 ) ); \
+    xmm3 = _mm_shuffle_epi8( xmm3, m128_const_64( 0x0e0104070a0d0003, \
+                                                  0x06090c0f0205080b ) ); \
+    xmm4 = _mm_shuffle_epi8( xmm4, m128_const_64( 0x0306090c0f020508, \
+                                                  0x0b0e0104070a0d00 ) ); \
+    xmm5 = _mm_shuffle_epi8( xmm5, m128_const_64( 0x05080b0e0104070a, \
+                                                  0x0d000306090c0f02 ) ); \
+    xmm6 = _mm_shuffle_epi8( xmm6, m128_const_64( 0x070a0d000306090c, \
+                                                  0x0f0205080b0e0104 ) ); \
+    xmm7 = _mm_shuffle_epi8( xmm7, m128_const_64( 0x090c0f0205080b0e, \
+                                                  0x0104070a0d000306 ) ); \
    /* SubBytes + MixBytes */\
-    SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
+    SUBMIX( xmm0,  xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7, \
+            xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
  }\
 }

@@ -278,7 +324,7 @@ __m128i ALL_FF;
 * clobbers: t0-t7
 */
 #define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
-  t0 = TRANSP_MASK;\
+  t0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 );\
 \
  i6 = _mm_shuffle_epi8(i6, t0);\
  i0 = _mm_shuffle_epi8(i0, t0);\
@@ -366,7 +412,7 @@ __m128i ALL_FF;
  i4 = _mm_unpacklo_epi64(i4, i5);\
  t1 = _mm_unpackhi_epi64(t1, i5);\
  t2 = i6;\
-  o0 = TRANSP_MASK;\
+  o0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 ); \
  i6 = _mm_unpacklo_epi64(i6, i7);\
  t2 = _mm_unpackhi_epi64(t2, i7);\
  /* load transpose mask into a register, because it will be used 8 times */\
--- a/algo/groestl/aes_ni/groestl-intr-avx.h
+++ b/algo/groestl/aes_ni/groestl-intr-avx.h
--- a/algo/groestl/aes_ni/groestl-intr-vperm.h
+++ b/algo/groestl/aes_ni/groestl-intr-vperm.h
--- a/algo/groestl/aes_ni/groestl-version.h
+++ b/algo/groestl/aes_ni/groestl-version.h
@@ -1,10 +0,0 @@
-// specify assembly or intrinsics implementation
-//#define TASM
-#define TINTR
-
-// Not to be confused with AVX512VAES
-#define VAES
-// #define VAVX
-// #define VVPERM
-
-//#endif
--- a/algo/groestl/aes_ni/groestl256-asm-aes.h
+++ b/algo/groestl/aes_ni/groestl256-asm-aes.h
@@ -1,529 +0,0 @@
-/* groestl-asm-aes.h     Aug 2011
- *
- * Groestl implementation with inline assembly using ssse3, sse4.1, and aes
- * instructions.
- * Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * This code is placed in the public domain
- */
-
-#include "hash-groestl256.h"
-/* global constants  */
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
-__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
-__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
-__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
-
-/* temporary variables  */
-__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
-__attribute__ ((aligned (16))) unsigned char TEMP[3*16];
-
-
-#define tos(a)    #a
-#define tostr(a)  tos(a)
-
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b */
-#define MUL2(i, j, k){\
-  asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\
-  asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\
-  asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\
-  asm("pand xmm"tostr(j)", xmm"tostr(k)"");\
-  asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\
-}/**/
-
-/* Yet another implementation of MixBytes.
-   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
-   Input: a0, ..., a7
-   Output: b0, ..., b7 = MixBytes(a0,...,a7).
-   but we use the relations:
-   t_i = a_i + a_{i+3}
-   x_i = t_i + t_{i+3}
-   y_i = t_i + t+{i+2} + a_{i+6}
-   z_i = 2*x_i
-   w_i = z_i + y_{i+4}
-   v_i = 2*w_i
-   b_i = v_{i+3} + y_{i+4}
-   We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
-   and then adding v_i computed in the meantime in registers xmm0..xmm7.
-   We almost fit into 16 registers, need only 3 spills to memory.
-   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
-   K. Matusiewicz, 2011/05/29 */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* t_i = a_i + a_{i+1} */\
-  asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
-  asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
-  asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\
-  asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
-  asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\
-  asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
-  asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\
-  asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
-  asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\
-  asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
-  asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\
-  asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
-  asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\
-  asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
-  asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\
-  asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\
-  \
-  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
-  asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\
-  asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
-  asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
-  asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\
-  asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\
-  /* spill values y_4, y_5 to memory */\
-  asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\
-  asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\
-  asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\
-  asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\
-  asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
-  asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\
-  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
-  asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
-  asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\
-  asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
-  asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
-  asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\
-  asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
-  asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\
-  asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
-  asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
-  \
-  /* compute x_i = t_i + t_{i+3} */\
-  asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\
-  asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\
-  asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\
-  asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\
-  asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\
-  asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\
-  asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
-  asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\
-  \
-  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
-  /* compute w_i : add y_{i+4} */\
-  asm("movaps xmm"tostr(b1)", [ALL_1B]");\
-  MUL2(a0, b0, b1);\
-  asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\
-  MUL2(a1, b0, b1);\
-  asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\
-  MUL2(a2, b0, b1);\
-  asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\
-  MUL2(a3, b0, b1);\
-  asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\
-  MUL2(a4, b0, b1);\
-  asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\
-  MUL2(a5, b0, b1);\
-  asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\
-  MUL2(a6, b0, b1);\
-  asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\
-  MUL2(a7, b0, b1);\
-  asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\
-  \
-  /* compute v_i : double w_i      */\
-  /* add to y_4 y_5 .. v3, v4, ... */\
-  MUL2(a0, b0, b1);\
-  asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
-  MUL2(a1, b0, b1);\
-  asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\
-  MUL2(a2, b0, b1);\
-  asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
-  MUL2(a5, b0, b1);\
-  asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\
-  MUL2(a6, b0, b1);\
-  asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
-  MUL2(a7, b0, b1);\
-  asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
-  MUL2(a3, b0, b1);\
-  MUL2(a4, b0, b1);\
-  asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\
-  asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\
-  asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
-  asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
-}/*MixBytes*/
-
-#define SET_CONSTANTS(){\
-  ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
-  ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
-  ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
-  ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
-  ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
-  ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
-  ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
-  ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
-  ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
-  ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
-  ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
-  ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
-  ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
-  ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
-  ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
-  ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
-  ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
-  ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
-    ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL)  ^ 0x7060504030201000ULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL)  ^ 0x8f9fafbfcfdfefffULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
-  }\
-  ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
-  ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
-}while(0);
-
-#define Push_All_Regs() do{\
-/*  not using any...
-    asm("push rax");\
-    asm("push rbx");\
-    asm("push rcx");*/\
-}while(0);
-
-#define Pop_All_Regs() do{\
-/*  not using any...
-    asm("pop rcx");\
-    asm("pop rbx");\
-    asm("pop rax");*/\
-}while(0);
-
-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
-#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* AddRoundConstant */\
-  asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
-  asm ("pxor   xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
-  asm ("pxor   xmm"tostr(a1)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a2)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a3)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a4)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a5)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a6)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
-  /* ShiftBytes + SubBytes (interleaved) */\
-  asm ("pxor xmm"tostr(b0)",  xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
-  asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
-  asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
-  asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
-  asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
-  asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
-  asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
-  asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\
-  asm ("pshufb     xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
-  asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\
-  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-}
-
-/* 10 rounds, P and Q in parallel */
-#define ROUNDS_P_Q(){\
-  ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-}
-
-/* Matrix Transpose Step 1
- * input is a 512-bit state with two columns in one xmm
- * output is a 512-bit state with two rows in one xmm
- * inputs: i0-i3
- * outputs: i0, o1-o3
- * clobbers: t0
- */
-#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
-  \
-  asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
-  \
-  asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
-  \
-  asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
-  asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
-  \
-  asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
-  asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
-  asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
-  asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
-  \
-  asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
-  \
-  asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
-  asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
-  asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
-  asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Step 2
- * input are two 512-bit states with two rows in one xmm
- * output are two 512-bit states with one row of each state in one xmm
- * inputs: i0-i3 = P, i4-i7 = Q
- * outputs: (i0, o1-o7) = (P|Q)
- * possible reassignments: (output reg = input reg)
- * * i1 -> o3-7
- * * i2 -> o5-7
- * * i3 -> o7
- * * i4 -> o3-7
- * * i5 -> o6-7
- */
-#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
-  asm ("movdqa     xmm"tostr(o1)", xmm"tostr(i0)"");\
-  asm ("movdqa     xmm"tostr(o2)", xmm"tostr(i1)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
-  asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
-  asm ("movdqa     xmm"tostr(o3)", xmm"tostr(i1)"");\
-  asm ("movdqa     xmm"tostr(o4)", xmm"tostr(i2)"");\
-  asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
-  asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
-  asm ("movdqa     xmm"tostr(o5)", xmm"tostr(i2)"");\
-  asm ("movdqa     xmm"tostr(o6)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
-  asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
-  asm ("movdqa     xmm"tostr(o7)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
-  asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Inverse Step 2
- * input are two 512-bit states with one row of each state in one xmm
- * output are two 512-bit states with two rows in one xmm
- * inputs: i0-i7 = (P|Q)
- * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
- */
-#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
-  asm ("movdqa     xmm"tostr(o0)", xmm"tostr(i0)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
-  asm ("movdqa     xmm"tostr(o1)", xmm"tostr(i2)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
-  asm ("movdqa     xmm"tostr(o2)", xmm"tostr(i4)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
-  asm ("movdqa     xmm"tostr(o3)", xmm"tostr(i6)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
-  asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Output Step 2
- * input is one 512-bit state with two rows in one xmm
- * output is one 512-bit state with one row in the low 64-bits of one xmm
- * inputs: i0,i2,i4,i6 = S
- * outputs: (i0-7) = (0|S)
- */
-#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
-  asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
-  asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
-  asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Output Inverse Step 2
- * input is one 512-bit state with one row in the low 64-bits of one xmm
- * output is one 512-bit state with two rows in one xmm
- * inputs: i0-i7 = (0|S)
- * outputs: (i0, i2, i4, i6) = S
- */
-#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
-}/**/
-
-
-void INIT256(u64* h)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  asm volatile ("emms");
-
-  /* load IV into registers xmm12 - xmm15 */
-  asm ("movaps xmm12, [rdi+0*16]");
-  asm ("movaps xmm13, [rdi+1*16]");
-  asm ("movaps xmm14, [rdi+2*16]");
-  asm ("movaps xmm15, [rdi+3*16]");
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* store transposed IV */
-  asm ("movaps [rdi+0*16], xmm12");
-  asm ("movaps [rdi+1*16], xmm2");
-  asm ("movaps [rdi+2*16], xmm6");
-  asm ("movaps [rdi+3*16], xmm7");
-
-  asm volatile ("emms");
-  asm (".att_syntax noprefix");
-}
-
-void TF512(u64* h, u64* m)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-  /* message M in rsi            */
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load message into registers xmm12 - xmm15 (Q = message) */
-  asm ("movaps xmm12, [rsi+0*16]");
-  asm ("movaps xmm13, [rsi+1*16]");
-  asm ("movaps xmm14, [rsi+2*16]");
-  asm ("movaps xmm15, [rsi+3*16]");
-
-  /* transform message M from column ordering into row ordering */
-  /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* load previous chaining value */
-  /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
-  asm ("movaps xmm8, [rdi+0*16]");
-  asm ("movaps xmm0, [rdi+1*16]");
-  asm ("movaps xmm4, [rdi+2*16]");
-  asm ("movaps xmm5, [rdi+3*16]");
-
-  /* xor message to CV get input of P */
-  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  asm ("pxor xmm8, xmm12");
-  asm ("pxor xmm0, xmm2");
-  asm ("pxor xmm4, xmm6");
-  asm ("pxor xmm5, xmm7");
-
-  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
-  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
-  /* result: the 8 rows of P and Q in xmm8 - xmm12 */
-  Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
-
-  /* compute the two permutations P and Q in parallel */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P or two rows of Q in one xmm register */
-  Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
-
-  /* xor output of P and Q */
-  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  asm ("pxor xmm0, xmm8");
-  asm ("pxor xmm1, xmm10");
-  asm ("pxor xmm2, xmm12");
-  asm ("pxor xmm3, xmm14");
-
-  /* xor CV (feed-forward) */
-  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  asm ("pxor xmm0, [rdi+0*16]");
-  asm ("pxor xmm1, [rdi+1*16]");
-  asm ("pxor xmm2, [rdi+2*16]");
-  asm ("pxor xmm3, [rdi+3*16]");
-
-  /* store CV */
-  asm ("movaps [rdi+0*16], xmm0");
-  asm ("movaps [rdi+1*16], xmm1");
-  asm ("movaps [rdi+2*16], xmm2");
-  asm ("movaps [rdi+3*16], xmm3");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-#ifdef IACA_TRACE
-  IACA_END;
-#endif
-  return;
-}
-
-void OF512(u64* h)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
-  asm ("movaps xmm8,  [rdi+0*16]");
-  asm ("movaps xmm10, [rdi+1*16]");
-  asm ("movaps xmm12, [rdi+2*16]");
-  asm ("movaps xmm14, [rdi+3*16]");
-
-  /* there are now 2 rows of the CV in one xmm register */
-  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
-  /* result: the 8 input rows of P in xmm8 - xmm15 */
-  Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
-
-  /* compute the permutation P */
-  /* result: the output of P(CV) in xmm8 - xmm15 */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P in one xmm register */
-  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
-  Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
-
-  /* xor CV to P output (feed-forward) */
-  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  asm ("pxor xmm8,  [rdi+0*16]");
-  asm ("pxor xmm10, [rdi+1*16]");
-  asm ("pxor xmm12, [rdi+2*16]");
-  asm ("pxor xmm14, [rdi+3*16]");
-
-  /* transform state back from row ordering into column ordering */
-  /* result: final hash value in xmm9, xmm11 */
-  Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
-
-  /* we only need to return the truncated half of the state */
-  asm ("movaps [rdi+2*16], xmm9");
-  asm ("movaps [rdi+3*16], xmm11");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-  return;
-}
-
--- a/algo/groestl/aes_ni/groestl256-asm-avx.h
+++ b/algo/groestl/aes_ni/groestl256-asm-avx.h
@@ -1,519 +0,0 @@
-/* groestl-asm-avx.h     Aug 2011
- *
- * Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx
- * instructions.
- * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * This code is placed in the public domain
- */
-
-#include "hash-groestl256.h"
-
-/* global variables  */
-__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16];
-__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
-__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
-__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
-__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
-__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16];
-__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16];
-__attribute__ ((aligned (32))) unsigned char ALL_1B[32];
-__attribute__ ((aligned (32))) unsigned char ALL_FF[32];
-
-/* temporary variables  */
-__attribute__ ((aligned (32))) unsigned char TEMP[6*32];
-
-
-#define tos(a)    #a
-#define tostr(a)  tos(a)
-
-#define SET_CONSTANTS(){\
-  ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
-  ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
-  ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
-  ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
-  ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
-  ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
-  ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
-  ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
-  ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
-  ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
-  ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
-  ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
-  ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
-  ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
-  ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
-  ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
-  ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
-  ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
-    ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL)  ^ 0x7060504030201000ULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL)  ^ 0x8f9fafbfcfdfefffULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
-  }\
-  ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
-  ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
-}while(0);
-
-#define Push_All_Regs() do{\
-/*  not using any...
-    asm("push rax");\
-    asm("push rbx");\
-    asm("push rcx");*/\
-}while(0);
-
-#define Pop_All_Regs() do{\
-/*  not using any...
-    asm("pop rcx");\
-    asm("pop rbx");\
-    asm("pop rax");*/\
-}while(0);
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b
- * xmm[z] has to be zero */
-#define VMUL2(i, j, k, z){\
-  asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\
-  asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
-  asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\
-  asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
-}/**/
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b
- * xmm[z] has to be zero */
-#define VMUL2v2(i, j, k, z){\
-  asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\
-  asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
-  asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
-}/**/
-
-/* Yet another implementation of MixBytes.
-   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
-   Input: a0, ..., a7
-   Output: b0, ..., b7 = MixBytes(a0,...,a7).
-   but we use the relations:
-   t_i = a_i + a_{i+3}
-   x_i = t_i + t_{i+3}
-   y_i = t_i + t+{i+2} + a_{i+6}
-   z_i = 2*x_i
-   w_i = z_i + y_{i+4}
-   v_i = 2*w_i
-   b_i = v_{i+3} + y_{i+4}
-   We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
-   and then adding v_i computed in the meantime in registers xmm0..xmm7.
-   We almost fit into 16 registers, need only 3 spills to memory.
-   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
-   K. Matusiewicz, 2011/05/29 */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
-  asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
-  asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
-  asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
-  asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
-  asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
-  asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
-  asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
-  asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
-  \
-  /* t_i = a_i + a_{i+1} */\
-  asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\
-  asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\
-  asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\
-  asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\
-  asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\
-  asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\
-  asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\
-  asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\
-  \
-  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\
-  asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\
-  asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\
-  asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\
-  asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\
-  asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\
-  asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\
-  asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\
-  \
-  asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\
-  asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\
-  asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\
-  asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\
-  asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\
-  asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\
-  asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\
-  asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\
-  \
-  /* spill values y_4, y_5 to memory */\
-  asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\
-  asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\
-  asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\
-  \
-  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
-  asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
-  asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
-  asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\
-  \
-  /* compute x_i = t_i + t_{i+3} */\
-  asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\
-  asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\
-  asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\
-  asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\
-  asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\
-  asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
-  asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
-  asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\
-  \
-  /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
-  asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\
-  asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\
-  VMUL2(a7, b0, b1, b2);\
-  VMUL2(a6, b0, b1, b2);\
-  VMUL2(a5, b0, b1, b2);\
-  VMUL2(a4, b0, b1, b2);\
-  VMUL2(a3, b0, b1, b2);\
-  VMUL2(a2, b0, b1, b2);\
-  VMUL2(a1, b0, b1, b2);\
-  VMUL2(a0, b0, b1, b2);\
-  \
-  /* compute w_i :  add y_{i+4} */\
-  asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\
-  asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\
-  asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\
-  asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\
-  asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\
-  asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\
-  asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\
-  asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\
-  \
-  /*compute v_i: double w_i */\
-  VMUL2(a0, b0, b1, b2);\
-  VMUL2(a1, b0, b1, b2);\
-  VMUL2(a2, b0, b1, b2);\
-  VMUL2(a3, b0, b1, b2);\
-  VMUL2(a4, b0, b1, b2);\
-  VMUL2(a5, b0, b1, b2);\
-  VMUL2(a6, b0, b1, b2);\
-  VMUL2(a7, b0, b1, b2);\
-  \
-  /* add to y_4 y_5 .. v3, v4, ... */\
-  asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\
-  asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\
-  asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\
-  asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\
-  asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\
-  asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\
-  asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\
-  asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\
-}/*MixBytes*/
-
-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
-#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* AddRoundConstant */\
-  asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
-  asm ("vpxor   xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
-  asm ("vpxor   xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
-  asm ("vpxor   xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
-  /* ShiftBytes + SubBytes (interleaved) */\
-  asm ("vpxor xmm"tostr(b0)",  xmm"tostr(b0)",  xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
-  asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
-  asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
-  asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
-  asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
-  asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
-  asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
-  asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\
-  asm ("vpshufb     xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
-  asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\
-  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-}
-
-/* 10 rounds, P and Q in parallel */
-#define ROUNDS_P_Q(){\
-  ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-}
-
-/* Matrix Transpose Step 1
- * input is a 512-bit state with two columns in one xmm
- * output is a 512-bit state with two rows in one xmm
- * inputs: i0-i3
-
- * outputs: i0, o1-o3
- * clobbers: t0
- */
-#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\
-\
-  asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\
-  asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\
-\
-  asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
-\
-  asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
-  asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
-  asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
-  asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
-\
-  asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\
-  asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\
-  asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\
-  asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Step 2
- * input are two 512-bit states with two rows in one xmm
- * output are two 512-bit states with one row of each state in one xmm
- * inputs: i0-i3 = P, i4-i7 = Q
- * outputs: (i0, o1-o7) = (P|Q)
- * possible reassignments: (output reg = input reg)
- * * i1 -> o3-7
- * * i2 -> o5-7
- * * i3 -> o7
- * * i4 -> o3-7
- * * i5 -> o6-7
- */
-#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
-  asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\
-  asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\
-  asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\
-  asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\
-  asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\
-  asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\
-  asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\
-  asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Inverse Step 2
- * input are two 512-bit states with one row of each state in one xmm
- * output are two 512-bit states with two rows in one xmm
- * inputs: i0-i7 = (P|Q)
- * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
- */
-#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
-  asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\
-  asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Output Step 2
- * input is one 512-bit state with two rows in one xmm
- * output is one 512-bit state with one row in the low 64-bits of one xmm
- * inputs: i0,i2,i4,i6 = S
- * outputs: (i0-7) = (0|S)
- */
-#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\
-  asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\
-  asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\
-  asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\
-  asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Output Inverse Step 2
- * input is one 512-bit state with one row in the low 64-bits of one xmm
- * output is one 512-bit state with two rows in one xmm
- * inputs: i0-i7 = (0|S)
- * outputs: (i0, i2, i4, i6) = S
- */
-#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
-}/**/
-
-
-void INIT256(u64* h)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  asm volatile ("emms");
-
-  /* load IV into registers xmm12 - xmm15 */
-  asm ("vmovaps xmm12, [rdi+0*16]");
-  asm ("vmovaps xmm13, [rdi+1*16]");
-  asm ("vmovaps xmm14, [rdi+2*16]");
-  asm ("vmovaps xmm15, [rdi+3*16]");
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* store transposed IV */
-  asm ("vmovaps [rdi+0*16], xmm12");
-  asm ("vmovaps [rdi+1*16], xmm2");
-  asm ("vmovaps [rdi+2*16], xmm6");
-  asm ("vmovaps [rdi+3*16], xmm7");
-
-  asm volatile ("emms");
-  asm (".att_syntax noprefix");
-}
-
-void TF512(u64* h, u64* m)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-  /* message M in rsi            */
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load message into registers xmm12 - xmm15 (Q = message) */
-  asm ("vmovaps xmm12, [rsi+0*16]");
-  asm ("vmovaps xmm13, [rsi+1*16]");
-  asm ("vmovaps xmm14, [rsi+2*16]");
-  asm ("vmovaps xmm15, [rsi+3*16]");
-
-  /* transform message M from column ordering into row ordering */
-  /* we first put two rows (64 bit) of the message into one 128-bit xmm register */
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* load previous chaining value and xor message to CV to get input of P */
-  /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
-  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  asm ("vpxor xmm8, xmm12, [rdi+0*16]");
-  asm ("vpxor xmm0, xmm2,  [rdi+1*16]");
-  asm ("vpxor xmm4, xmm6,  [rdi+2*16]");
-  asm ("vpxor xmm5, xmm7,  [rdi+3*16]");
-
-  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
-  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
-  /* result: the 8 rows of P and Q in xmm8 - xmm12 */
-  Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
-
-  /* compute the two permutations P and Q in parallel */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P or two rows of Q in one xmm register */
-  Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
-
-  /* xor output of P and Q */
-  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  asm ("vpxor xmm0, xmm0, xmm8");
-  asm ("vpxor xmm1, xmm1, xmm10");
-  asm ("vpxor xmm2, xmm2, xmm12");
-  asm ("vpxor xmm3, xmm3, xmm14");
-
-  /* xor CV (feed-forward) */
-  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  asm ("vpxor xmm0, xmm0, [rdi+0*16]");
-  asm ("vpxor xmm1, xmm1, [rdi+1*16]");
-  asm ("vpxor xmm2, xmm2, [rdi+2*16]");
-  asm ("vpxor xmm3, xmm3, [rdi+3*16]");
-
-  /* store CV */
-  asm ("vmovaps [rdi+0*16], xmm0");
-  asm ("vmovaps [rdi+1*16], xmm1");
-  asm ("vmovaps [rdi+2*16], xmm2");
-  asm ("vmovaps [rdi+3*16], xmm3");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-#ifdef IACA_TRACE
-  IACA_END;
-#endif
-  return;
-}
-
-void OF512(u64* h)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
-  asm ("vmovaps xmm8,  [rdi+0*16]");
-  asm ("vmovaps xmm10, [rdi+1*16]");
-  asm ("vmovaps xmm12, [rdi+2*16]");
-  asm ("vmovaps xmm14, [rdi+3*16]");
-
-  /* there are now 2 rows of the CV in one xmm register */
-  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
-  /* result: the 8 input rows of P in xmm8 - xmm15 */
-  Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
-
-  /* compute the permutation P */
-  /* result: the output of P(CV) in xmm8 - xmm15 */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P in one xmm register */
-  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
-  Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
-
-  /* xor CV to P output (feed-forward) */
-  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  asm ("vpxor xmm8,  xmm8,  [rdi+0*16]");
-  asm ("vpxor xmm10, xmm10, [rdi+1*16]");
-  asm ("vpxor xmm12, xmm12, [rdi+2*16]");
-  asm ("vpxor xmm14, xmm14, [rdi+3*16]");
-
-  /* transform state back from row ordering into column ordering */
-  /* result: final hash value in xmm9, xmm11 */
-  Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
-
-  /* we only need to return the truncated half of the state */
-  asm ("vmovaps [rdi+2*16], xmm9");
-  asm ("vmovaps [rdi+3*16], xmm11");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-  return;
-}
-
--- a/algo/groestl/aes_ni/groestl256-asm-vperm.h
+++ b/algo/groestl/aes_ni/groestl256-asm-vperm.h
@@ -1,856 +0,0 @@
-/* groestl-asm-vperm.h     Aug 2011
- *
- * Groestl implementation with inline assembly using ssse3 instructions.
- * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * Based on the vperm and aes_ni implementations of the hash function Groestl
- * by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
- * Institute of Applied Mathematics, Middle East Technical University, Turkey
- *
- * This code is placed in the public domain
- */
-
-#include "hash-groestl256.h"
-
-/* global constants  */
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
-__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
-__attribute__ ((aligned (16))) unsigned char ALL_0F[16];
-__attribute__ ((aligned (16))) unsigned char ALL_15[16];
-__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
-__attribute__ ((aligned (16))) unsigned char ALL_63[16];
-__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
-__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16];
-__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16];
-
-/* temporary variables  */
-__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16];
-__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16];
-__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16];
-__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
-__attribute__ ((aligned (16))) unsigned char TEMP[8*16];
-
-
-#define tos(a)    #a
-#define tostr(a)  tos(a)
-
-#define SET_SHARED_CONSTANTS(){\
-  ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
-  ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
-  ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
-  ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\
-  ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\
-  ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\
-  ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\
-  ((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\
-  ((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\
-  ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\
-  ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\
-  ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\
-  ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\
-  ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\
-  ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\
-  ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\
-  ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\
-  ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\
-  ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\
-  ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\
-  ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\
-  ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\
-  ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\
-  ((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\
-  ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\
-  ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\
-  ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\
-  ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\
-  ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\
-  ((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\
-  ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\
-/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\
-  ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\
-  ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\
-  ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\
-  ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\
-  ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\
-}/**/
-
-/* VPERM
- * Transform w/o settings c*
- * transforms 2 rows to/from "vperm mode"
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0, a1 = 2 rows
- * table = transformation table to use
- * t*, c* = clobbers
- * outputs:
- * a0, a1 = 2 rows transformed with table
- * */
-#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
-  asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
-  asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\
-  asm ("pandn  xmm"tostr(t0)", xmm"tostr(a0)"");\
-  asm ("pandn  xmm"tostr(t1)", xmm"tostr(a1)"");\
-  asm ("psrld  xmm"tostr(t0)", 4");\
-  asm ("psrld  xmm"tostr(t1)", 4");\
-  asm ("pand   xmm"tostr(a0)", xmm"tostr(c0)"");\
-  asm ("pand   xmm"tostr(a1)", xmm"tostr(c0)"");\
-  asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\
-  asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\
-  asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\
-  asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\
-  asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\
-  asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\
-  asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\
-  asm ("pxor   xmm"tostr(a0)", xmm"tostr(t2)"");\
-  asm ("pxor   xmm"tostr(a1)", xmm"tostr(t3)"");\
-}/**/
-
-#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
-  asm ("movaps xmm"tostr(c0)", [ALL_0F]");\
-  asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\
-  asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\
-}/**/
-
-/* VPERM
- * Transform
- * transforms 2 rows to/from "vperm mode"
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0, a1 = 2 rows
- * table = transformation table to use
- * t*, c* = clobbers
- * outputs:
- * a0, a1 = 2 rows transformed with table
- * */
-#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
-  VPERM_Transform_Set_Const(table, c0, c1, c2);\
-  VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Transform State
- * inputs:
- * a0-a3 = state
- * table = transformation table to use
- * t* = clobbers
- * outputs:
- * a0-a3 = transformed state
- * */
-#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
-  VPERM_Transform_Set_Const(table, c0, c1, c2);\
-  VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
-  VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Add Constant to State
- * inputs:
- * a0-a7 = state
- * constant = constant to add
- * t0 = clobber
- * outputs:
- * a0-a7 = state + constant
- * */
-#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
-  asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\
-  asm ("pxor   xmm"tostr(a0)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a1)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a2)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a3)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a4)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a5)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a6)",  xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(a7)",  xmm"tostr(t0)"");\
-}/**/
-
-/* VPERM
- * Set Substitute Core Constants
- * */
-#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
-  VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Substitute Core
- * first part of sbox inverse computation
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0 = 1 row
- * t*, c* = clobbers
- * outputs:
- * b0a, b0b = inputs for lookup step
- * */
-#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
-  asm ("movdqa xmm"tostr(t0)",  xmm"tostr(c0)"");\
-  asm ("pandn  xmm"tostr(t0)",  xmm"tostr(a0)"");\
-  asm ("psrld  xmm"tostr(t0)",  4");\
-  asm ("pand   xmm"tostr(a0)",  xmm"tostr(c0)"");\
-  asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\
-  asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\
-  asm ("pxor   xmm"tostr(a0)",  xmm"tostr(t0)"");\
-  asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
-  asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\
-  asm ("pxor   xmm"tostr(b0b)", xmm"tostr(b0a)"");\
-  asm ("movdqa xmm"tostr(t1)",  xmm"tostr(c2)"");\
-  asm ("pshufb xmm"tostr(t1)",  xmm"tostr(a0)"");\
-  asm ("pxor   xmm"tostr(t1)",  xmm"tostr(b0a)"");\
-  asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\
-  asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\
-  asm ("pxor   xmm"tostr(b0a)", xmm"tostr(a0)"");\
-  asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
-  asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\
-  asm ("pxor   xmm"tostr(b0b)", xmm"tostr(t0)"");\
-}/**/
-
-/* VPERM
- * Lookup
- * second part of sbox inverse computation
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0a, a0b = output of Substitution Core
- * table = lookup table to use (*1 / *2 / *4)
- * t0 = clobber
- * outputs:
- * b0 = output of sbox + multiplication
- * */
-#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
-  asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\
-  asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\
-  asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\
-  asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\
-  asm ("pxor   xmm"tostr(b0)", xmm"tostr(t0)"");\
-}/**/
-
-/* VPERM
- * SubBytes and *2 / *4
- * this function is derived from:
- *   Constant-time SSSE3 AES core implementation
- *   by Mike Hamburg
- * and
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0-a7 = state
- * t*, c* = clobbers
- * outputs:
- * a0-a7 = state * 4
- * c2 = row0 * 2 -> b0
- * c1 = row7 * 2 -> b3
- * c0 = row7 * 1 -> b4
- * t2 = row4 * 1 -> b7
- * TEMP_MUL1 = row(i) * 1
- * TEMP_MUL2 = row(i) * 2
- *
- * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
-#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
-  /* set Constants */\
-  VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
-  /* row 1 */\
-  VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
-  /* --- */\
-  /* row 2 */\
-  VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
-  /* --- */\
-  /* row 3 */\
-  VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
-  /* --- */\
-  /* row 5 */\
-  VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
-  /* --- */\
-  /* row 6 */\
-  VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
-  /* --- */\
-  /* row 7 */\
-  VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
-  /* --- */\
-  /* row 4 */\
-  VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
-  /* --- */\
-  /* row 0 */\
-  VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
-  VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
-  asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
-  /* --- */\
-}/**/
-
-
-/* Optimized MixBytes
- * inputs:
- * a0-a7 = (row0-row7) * 4
- * b0 = row0 * 2
- * b3 = row7 * 2
- * b4 = row7 * 1
- * b7 = row4 * 1
- * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
- * output: b0-b7
- * */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* save one value */\
-  asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\
-  /* 1 */\
-  asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\
-  asm ("pxor   xmm"tostr(b1)", xmm"tostr(a5)"");\
-  asm ("pxor   xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\
-  asm ("pxor   xmm"tostr(b1)", [TEMP_MUL2+3*16]");\
-  asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\
-  \
-  /* 2 */\
-  asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\
-  asm ("pxor   xmm"tostr(b5)", xmm"tostr(a4)"");\
-  asm ("pxor   xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\
-  asm ("pxor   xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\
-  asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\
-  \
-  /* 4 */\
-  asm ("pxor   xmm"tostr(b7)", xmm"tostr(a6)"");\
-  /*asm ("pxor   xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\
-  asm ("pxor   xmm"tostr(b7)", [TEMP_MUL1+6*16]");\
-  asm ("pxor   xmm"tostr(b7)", [TEMP_MUL2+1*16]");\
-  asm ("pxor   xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\
-  asm ("pxor   xmm"tostr(b2)", xmm"tostr(b7)"");\
-  \
-  /* 3 */\
-  asm ("pxor   xmm"tostr(b0)", xmm"tostr(a7)"");\
-  asm ("pxor   xmm"tostr(b0)", [TEMP_MUL1+5*16]");\
-  asm ("pxor   xmm"tostr(b0)", [TEMP_MUL1+7*16]");\
-  /*asm ("pxor   xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\
-  asm ("pxor   xmm"tostr(b0)", [TEMP_MUL2+2*16]");\
-  asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\
-  asm ("pxor   xmm"tostr(b1)", xmm"tostr(b0)"");\
-  asm ("pxor   xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\
-  \
-  /* 5 */\
-  asm ("pxor   xmm"tostr(b4)", xmm"tostr(a2)"");\
-  /*asm ("pxor   xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\
-  asm ("pxor   xmm"tostr(b4)", [TEMP_MUL1+2*16]");\
-  asm ("pxor   xmm"tostr(b4)", [TEMP_MUL2+3*16]");\
-  asm ("pxor   xmm"tostr(b4)", [TEMP_MUL2+5*16]");\
-  asm ("pxor   xmm"tostr(b3)", xmm"tostr(b4)"");\
-  asm ("pxor   xmm"tostr(b6)", xmm"tostr(b4)"");\
-  \
-  /* 6 */\
-  asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\
-  asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\
-  asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\
-  asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\
-  asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\
-  asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
-  asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
-  \
-  /* 7 */\
-  asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\
-  asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\
-  asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\
-  asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
-  \
-  /* 8 */\
-  asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\
-  asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\
-  asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
-  asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
-  \
-  /* 9 */\
-  asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\
-  asm ("pxor   xmm"tostr(a3)", [TEMP_MUL2+5*16]");\
-  asm ("pxor   xmm"tostr(b0)", xmm"tostr(a3)"");\
-  asm ("pxor   xmm"tostr(b5)", xmm"tostr(a3)"");\
-  \
-  /* 10 */\
-  asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\
-  asm ("pxor   xmm"tostr(a1)", [TEMP_MUL2+1*16]");\
-  asm ("pxor   xmm"tostr(b1)", xmm"tostr(a1)"");\
-  asm ("pxor   xmm"tostr(b4)", xmm"tostr(a1)"");\
-  \
-  /* 11 */\
-  asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\
-  asm ("pxor   xmm"tostr(a5)", [TEMP_MUL2+6*16]");\
-  asm ("pxor   xmm"tostr(b1)", xmm"tostr(a5)"");\
-  asm ("pxor   xmm"tostr(b6)", xmm"tostr(a5)"");\
-  \
-  /* 12 */\
-  asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\
-  asm ("pxor   xmm"tostr(a3)", [TEMP_MUL2+2*16]");\
-  asm ("pxor   xmm"tostr(b2)", xmm"tostr(a3)"");\
-  asm ("pxor   xmm"tostr(b5)", xmm"tostr(a3)"");\
-  \
-  /* 13 */\
-  asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\
-  asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
-  asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
-  asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
-  asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
-  asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
-  asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
-  asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
-}/**/
-
-//#if (LENGTH <= 256)
-
-#define SET_CONSTANTS(){\
-  SET_SHARED_CONSTANTS();\
-  ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\
-  ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\
-  ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\
-  ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\
-  ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\
-  ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\
-  ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\
-  ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\
-  ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\
-  ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\
-  ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\
-  ((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\
-  ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\
-  ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\
-  ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\
-  ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
-    ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL)  ^ 0x7060504030201000ULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL)  ^ 0x8f9fafbfcfdfefffULL;\
-    ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
-  }\
-  ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
-  ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
-}/**/
-
-#define Push_All_Regs(){\
-/*  not using any...
-    asm("push rax");\
-    asm("push rbx");\
-    asm("push rcx");*/\
-}/**/
-
-#define Pop_All_Regs(){\
-/*  not using any...
-    asm("pop rcx");\
-    asm("pop rbx");\
-    asm("pop rax");*/\
-}/**/
-
-
-/* vperm:
- * transformation before rounds with ipt
- * first round add transformed constant
- * middle rounds: add constant XOR 0x15...15
- * last round: additionally add 0x15...15 after MB
- * transformation after rounds with opt
- */
-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
-#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* AddRoundConstant + ShiftBytes (interleaved) */\
-  asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
-  asm ("pxor   xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
-  asm ("pxor   xmm"tostr(a1)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a2)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a3)", xmm"tostr(b1)"");\
-  asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
-  asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
-  asm ("pxor   xmm"tostr(a4)", xmm"tostr(b1)"");\
-  asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
-  asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
-  asm ("pxor   xmm"tostr(a5)", xmm"tostr(b1)"");\
-  asm ("pxor   xmm"tostr(a6)", xmm"tostr(b1)"");\
-  asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
-  asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
-  asm ("pxor   xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
-  asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
-  asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
-  /* SubBytes + Multiplication by 2 and 4 */\
-  VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
-  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-}/**/
-
-/* 10 rounds, P and Q in parallel */
-#define ROUNDS_P_Q(){\
-  VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
-  ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
-  ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
-  VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
-}
-
-
-/* Matrix Transpose Step 1
- * input is a 512-bit state with two columns in one xmm
- * output is a 512-bit state with two rows in one xmm
- * inputs: i0-i3
- * outputs: i0, o1-o3
- * clobbers: t0
- */
-#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
-\
-  asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
-\
-  asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
-\
-  asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
-  asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
-\
-  asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
-  asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
-  asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
-  asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
-\
-  asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
-\
-  asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
-  asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
-  asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
-  asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Step 2
- * input are two 512-bit states with two rows in one xmm
- * output are two 512-bit states with one row of each state in one xmm
- * inputs: i0-i3 = P, i4-i7 = Q
- * outputs: (i0, o1-o7) = (P|Q)
- * possible reassignments: (output reg = input reg)
- * * i1 -> o3-7
- * * i2 -> o5-7
- * * i3 -> o7
- * * i4 -> o3-7
- * * i5 -> o6-7
- */
-#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
-  asm ("movdqa     xmm"tostr(o1)", xmm"tostr(i0)"");\
-  asm ("movdqa     xmm"tostr(o2)", xmm"tostr(i1)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
-  asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
-  asm ("movdqa     xmm"tostr(o3)", xmm"tostr(i1)"");\
-  asm ("movdqa     xmm"tostr(o4)", xmm"tostr(i2)"");\
-  asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
-  asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
-  asm ("movdqa     xmm"tostr(o5)", xmm"tostr(i2)"");\
-  asm ("movdqa     xmm"tostr(o6)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
-  asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
-  asm ("movdqa     xmm"tostr(o7)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
-  asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Inverse Step 2
- * input are two 512-bit states with one row of each state in one xmm
- * output are two 512-bit states with two rows in one xmm
- * inputs: i0-i7 = (P|Q)
- * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
- */
-#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
-  asm ("movdqa     xmm"tostr(o0)", xmm"tostr(i0)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
-  asm ("movdqa     xmm"tostr(o1)", xmm"tostr(i2)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
-  asm ("movdqa     xmm"tostr(o2)", xmm"tostr(i4)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
-  asm ("movdqa     xmm"tostr(o3)", xmm"tostr(i6)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
-  asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
-}/**/
-
-/* Matrix Transpose Output Step 2
- * input is one 512-bit state with two rows in one xmm
- * output is one 512-bit state with one row in the low 64-bits of one xmm
- * inputs: i0,i2,i4,i6 = S
- * outputs: (i0-7) = (0|S)
- */
-#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
-  asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
-  asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
-  asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
-  asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
-  asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
-}/**/
-
-/* Matrix Transpose Output Inverse Step 2
- * input is one 512-bit state with one row in the low 64-bits of one xmm
- * output is one 512-bit state with two rows in one xmm
- * inputs: i0-i7 = (0|S)
- * outputs: (i0, i2, i4, i6) = S
- */
-#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
-  asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
-  asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
-  asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
-}/**/
-
-
-/* transform round constants into VPERM mode */
-#define VPERM_Transform_RoundConst_CNT2(i, j){\
-  asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\
-  asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\
-  asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\
-  asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\
-  VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
-  asm ("pxor xmm0, [ALL_15]");\
-  asm ("pxor xmm1, [ALL_15]");\
-  asm ("pxor xmm2, [ALL_15]");\
-  asm ("pxor xmm3, [ALL_15]");\
-  asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\
-  asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\
-  asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\
-  asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\
-}/**/
-
-/* transform round constants into VPERM mode */
-#define VPERM_Transform_RoundConst(){\
-  asm ("movaps xmm0, [ROUND_CONST_Lx]");\
-  VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
-  asm ("pxor xmm0, [ALL_15]");\
-  asm ("movaps [ROUND_CONST_Lx], xmm0");\
-  VPERM_Transform_RoundConst_CNT2(0, 1);\
-  VPERM_Transform_RoundConst_CNT2(2, 3);\
-  VPERM_Transform_RoundConst_CNT2(4, 5);\
-  VPERM_Transform_RoundConst_CNT2(6, 7);\
-  VPERM_Transform_RoundConst_CNT2(8, 9);\
-}/**/
-
-void INIT256(u64* h)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  asm volatile ("emms");
-
-  /* transform round constants into VPERM mode */
-  VPERM_Transform_RoundConst();
-
-  /* load IV into registers xmm12 - xmm15 */
-  asm ("movaps xmm12, [rdi+0*16]");
-  asm ("movaps xmm13, [rdi+1*16]");
-  asm ("movaps xmm14, [rdi+2*16]");
-  asm ("movaps xmm15, [rdi+3*16]");
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* store transposed IV */
-  asm ("movaps [rdi+0*16], xmm12");
-  asm ("movaps [rdi+1*16], xmm2");
-  asm ("movaps [rdi+2*16], xmm6");
-  asm ("movaps [rdi+3*16], xmm7");
-
-  asm volatile ("emms");
-  asm (".att_syntax noprefix");
-}
-
-void TF512(u64* h, u64* m)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-  /* message M in rsi            */
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load message into registers xmm12 - xmm15 (Q = message) */
-  asm ("movaps xmm12, [rsi+0*16]");
-  asm ("movaps xmm13, [rsi+1*16]");
-  asm ("movaps xmm14, [rsi+2*16]");
-  asm ("movaps xmm15, [rsi+3*16]");
-
-  /* transform message M from column ordering into row ordering */
-  /* we first put two rows (64 bit) of the message into one 128-bit xmm register */
-  VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
-  Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
-
-  /* load previous chaining value */
-  /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
-  asm ("movaps xmm8, [rdi+0*16]");
-  asm ("movaps xmm0, [rdi+1*16]");
-  asm ("movaps xmm4, [rdi+2*16]");
-  asm ("movaps xmm5, [rdi+3*16]");
-
-  /* xor message to CV get input of P */
-  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  asm ("pxor xmm8, xmm12");
-  asm ("pxor xmm0, xmm2");
-  asm ("pxor xmm4, xmm6");
-  asm ("pxor xmm5, xmm7");
-
-  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
-  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
-  /* result: the 8 rows of P and Q in xmm8 - xmm12 */
-  Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
-
-  /* compute the two permutations P and Q in parallel */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P or two rows of Q in one xmm register */
-  Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
-
-  /* xor output of P and Q */
-  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  asm ("pxor xmm0, xmm8");
-  asm ("pxor xmm1, xmm10");
-  asm ("pxor xmm2, xmm12");
-  asm ("pxor xmm3, xmm14");
-
-  /* xor CV (feed-forward) */
-  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  asm ("pxor xmm0, [rdi+0*16]");
-  asm ("pxor xmm1, [rdi+1*16]");
-  asm ("pxor xmm2, [rdi+2*16]");
-  asm ("pxor xmm3, [rdi+3*16]");
-
-  /* store CV */
-  asm ("movaps [rdi+0*16], xmm0");
-  asm ("movaps [rdi+1*16], xmm1");
-  asm ("movaps [rdi+2*16], xmm2");
-  asm ("movaps [rdi+3*16], xmm3");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-#ifdef IACA_TRACE
-  IACA_END;
-#endif
-
-  return;
-}
-
-void OF512(u64* h)
-{
-  /* __cdecl calling convention: */
-  /* chaining value CV in rdi    */
-
-  asm (".intel_syntax noprefix");
-  Push_All_Regs();
-
-  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
-  asm ("movaps xmm8,  [rdi+0*16]");
-  asm ("movaps xmm10, [rdi+1*16]");
-  asm ("movaps xmm12, [rdi+2*16]");
-  asm ("movaps xmm14, [rdi+3*16]");
-
-  /* there are now 2 rows of the CV in one xmm register */
-  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
-  /* result: the 8 input rows of P in xmm8 - xmm15 */
-  Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
-
-  /* compute the permutation P */
-  /* result: the output of P(CV) in xmm8 - xmm15 */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P in one xmm register */
-  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
-  Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
-
-  /* xor CV to P output (feed-forward) */
-  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  asm ("pxor xmm8,  [rdi+0*16]");
-  asm ("pxor xmm10, [rdi+1*16]");
-  asm ("pxor xmm12, [rdi+2*16]");
-  asm ("pxor xmm14, [rdi+3*16]");
-
-  /* transform state back from row ordering into column ordering */
-  /* result: final hash value in xmm9, xmm11 */
-  Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
-  VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7);
-
-  /* we only need to return the truncated half of the state */
-  asm ("movaps [rdi+2*16], xmm9");
-  asm ("movaps [rdi+3*16], xmm11");
-
-  Pop_All_Regs();
-  asm (".att_syntax noprefix");
-
-  return;
-}
-
-
--- a/algo/groestl/aes_ni/groestl256-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl256-intr-aes.h
@@ -11,18 +11,6 @@
 #include <wmmintrin.h>
 #include "hash-groestl256.h"

-/* global constants  */
-__m128i ROUND_CONST_Lx;
-__m128i ROUND_CONST_L0[ROUNDS512];
-__m128i ROUND_CONST_L7[ROUNDS512];
-//__m128i ROUND_CONST_P[ROUNDS1024];
-//__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_1B;
-__m128i ALL_FF;
-
-
 #define tos(a)    #a
 #define tostr(a)  tos(a)

@@ -113,7 +101,7 @@ __m128i ALL_FF;
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = ALL_1B;\
+  b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
  MUL2(a0, b0, b1);\
  a0 = _mm_xor_si128(a0, TEMP0);\
  MUL2(a1, b0, b1);\
@@ -153,24 +141,35 @@ __m128i ALL_FF;
  b1 = _mm_xor_si128(b1, a4);\
 }/*MixBytes*/

-#define SET_CONSTANTS(){\
-   ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
-  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
-  SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
-  SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
-  SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
-  SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
-  SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
-  SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
-  SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
-  SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
-    ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
-  }\
-  ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
-}while(0); \
+
+static const uint64_t round_const_l0[] __attribute__ ((aligned (64))) =
+{
+  0x7060504030201000, 0xffffffffffffffff,
+  0x7161514131211101, 0xffffffffffffffff,
+  0x7262524232221202, 0xffffffffffffffff,
+  0x7363534333231303, 0xffffffffffffffff,
+  0x7464544434241404, 0xffffffffffffffff,
+  0x7565554535251505, 0xffffffffffffffff,
+  0x7666564636261606, 0xffffffffffffffff,
+  0x7767574737271707, 0xffffffffffffffff,
+  0x7868584838281808, 0xffffffffffffffff,
+  0x7969594939291909, 0xffffffffffffffff
+};
+
+static const uint64_t round_const_l7[] __attribute__ ((aligned (64))) =
+{
+0x0000000000000000, 0x8f9fafbfcfdfefff,
+0x0000000000000000, 0x8e9eaebecedeeefe,
+0x0000000000000000, 0x8d9dadbdcdddedfd,
+0x0000000000000000, 0x8c9cacbcccdcecfc,
+0x0000000000000000, 0x8b9babbbcbdbebfb,
+0x0000000000000000, 0x8a9aaabacadaeafa,
+0x0000000000000000, 0x8999a9b9c9d9e9f9,
+0x0000000000000000, 0x8898a8b8c8d8e8f8,
+0x0000000000000000, 0x8797a7b7c7d7e7f7,
+0x0000000000000000, 0x8696a6b6c6d6e6f6
+};
+

 /* one round
 * i = round number
@@ -179,34 +178,42 @@ __m128i ALL_FF;
 */
 #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = ROUND_CONST_Lx;\
-  a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
-  a1 = _mm_xor_si128(a1, b1);\
-  a2 = _mm_xor_si128(a2, b1);\
-  a3 = _mm_xor_si128(a3, b1);\
-  a4 = _mm_xor_si128(a4, b1);\
-  a5 = _mm_xor_si128(a5, b1);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
+  b1 = m128_const_64( 0xffffffffffffffff, 0 ); \
+  a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
+  a1 = _mm_xor_si128( a1, b1 ); \
+  a2 = _mm_xor_si128( a2, b1 ); \
+  a3 = _mm_xor_si128( a3, b1 ); \
+  a4 = _mm_xor_si128( a4, b1 ); \
+  a5 = _mm_xor_si128( a5, b1 ); \
+  a6 = _mm_xor_si128( a6, b1 ); \
+  a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \
  \
  /* ShiftBytes + SubBytes (interleaved) */\
  b0 = _mm_xor_si128(b0,  b0);\
-  a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
-  a0 = _mm_aesenclast_si128(a0, b0);\
-  a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
-  a1 = _mm_aesenclast_si128(a1, b0);\
-  a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
-  a2 = _mm_aesenclast_si128(a2, b0);\
-  a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
-  a3 = _mm_aesenclast_si128(a3, b0);\
-  a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
-  a4 = _mm_aesenclast_si128(a4, b0);\
-  a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
-  a5 = _mm_aesenclast_si128(a5, b0);\
-  a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
-  a6 = _mm_aesenclast_si128(a6, b0);\
-  a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
-  a7 = _mm_aesenclast_si128(a7, b0);\
+  a0 = _mm_shuffle_epi8( a0, m128_const_64( 0x03060a0d08020509, \
+                                            0x0c0f0104070b0e00 ) ); \
+  a0 = _mm_aesenclast_si128( a0, b0 );\
+  a1 = _mm_shuffle_epi8( a1, m128_const_64( 0x04070c0f0a03060b, \
+                                            0x0e090205000d0801 ) ); \
+  a1 = _mm_aesenclast_si128( a1, b0 );\
+  a2 = _mm_shuffle_epi8( a2, m128_const_64( 0x05000e090c04070d, \
+                                            0x080b0306010f0a02 ) ); \
+  a2 = _mm_aesenclast_si128( a2, b0 );\
+  a3 = _mm_shuffle_epi8( a3, m128_const_64( 0x0601080b0e05000f, \
+                                            0x0a0d040702090c03 ) ); \
+  a3 = _mm_aesenclast_si128( a3, b0 );\
+  a4 = _mm_shuffle_epi8( a4, m128_const_64( 0x0702090c0f060108, \
+                                            0x0b0e0500030a0d04 ) ); \
+  a4 = _mm_aesenclast_si128( a4, b0 );\
+  a5 = _mm_shuffle_epi8( a5, m128_const_64( 0x00030b0e0907020a, \
+                                            0x0d080601040c0f05 ) ); \
+  a5 = _mm_aesenclast_si128( a5, b0 );\
+  a6 = _mm_shuffle_epi8( a6, m128_const_64( 0x01040d080b00030c, \
+                                            0x0f0a0702050e0906 ) ); \
+  a6 = _mm_aesenclast_si128( a6, b0 );\
+  a7 = _mm_shuffle_epi8( a7, m128_const_64( 0x02050f0a0d01040e, \
+                                            0x090c000306080b07 ) ); \
+  a7 = _mm_aesenclast_si128( a7, b0 );\
  \
  /* MixBytes */\
  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
@@ -235,7 +242,7 @@ __m128i ALL_FF;
 * clobbers: t0
 */
 #define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  t0 = TRANSP_MASK;\
+  t0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 ); \
  \
  i0 = _mm_shuffle_epi8(i0, t0);\
  i1 = _mm_shuffle_epi8(i1, t0);\
--- a/algo/groestl/aes_ni/groestl256-intr-avx.h
+++ b/algo/groestl/aes_ni/groestl256-intr-avx.h
@@ -1,482 +0,0 @@
-/* groestl-intr-avx.h     Aug 2011
- *
- * Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx
- * instructions.
- * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * This code is placed in the public domain
- */
-
-#include <smmintrin.h>
-#include <wmmintrin.h>
-#include <immintrin.h>
-#include "hash-groestl256.h"
-
-/* global constants  */
-__m128i ROUND_CONST_Lx;
-__m128i ROUND_CONST_L0[ROUNDS512];
-__m128i ROUND_CONST_L7[ROUNDS512];
-__m128i ROUND_CONST_P[ROUNDS1024];
-__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_FF;
-//#if LENGTH <= 256
-__m128i ALL_1B;
-//#else
-//__m256d ALL_1B;
-//#endif
-
-#define tos(a)    #a
-#define tostr(a)  tos(a)
-
-#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos)))
-#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos))
-
-#define SET_CONSTANTS(){\
-  ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
-  ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
-  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
-  SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
-  SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
-  SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
-  SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
-  SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
-  SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
-  SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
-  SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
-    ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
-  }\
-  ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
-}while(0);
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b
- * xmm[z] has to be zero */
-#define VMUL2(i, j, k, z){\
-  j = _mm_cmpgt_epi8(z, i);\
-  i = _mm_add_epi8(i, i);\
-  j = _mm_and_si128(j, k);\
-  i = _mm_xor_si128(i, j);\
-}/**/
-
-/* Yet another implementation of MixBytes.
-   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
-   Input: a0, ..., a7
-   Output: b0, ..., b7 = MixBytes(a0,...,a7).
-   but we use the relations:
-   t_i = a_i + a_{i+3}
-   x_i = t_i + t_{i+3}
-   y_i = t_i + t+{i+2} + a_{i+6}
-   z_i = 2*x_i
-   w_i = z_i + y_{i+4}
-   v_i = 2*w_i
-   b_i = v_{i+3} + y_{i+4}
-   We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
-   and then adding v_i computed in the meantime in registers xmm0..xmm7.
-   We almost fit into 16 registers, need only 3 spills to memory.
-   This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
-   K. Matusiewicz, 2011/05/29 */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
-  b0 = a2;\
-  b1 = a3;\
-  b2 = a4;\
-  b3 = a5;\
-  b4 = a6;\
-  b5 = a7;\
-  b6 = a0;\
-  b7 = a1;\
-  \
-  /* t_i = a_i + a_{i+1} */\
-  a0 = _mm_xor_si128(a0, a1);\
-  a1 = _mm_xor_si128(a1, a2);\
-  a2 = _mm_xor_si128(a2, a3);\
-  a3 = _mm_xor_si128(a3, a4);\
-  a4 = _mm_xor_si128(a4, a5);\
-  a5 = _mm_xor_si128(a5, a6);\
-  a6 = _mm_xor_si128(a6, a7);\
-  a7 = _mm_xor_si128(a7, b6);\
-  \
-  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  b0 = _mm_xor_si128(b0, a4);\
-  b1 = _mm_xor_si128(b1, a5);\
-  b2 = _mm_xor_si128(b2, a6);\
-  b3 = _mm_xor_si128(b3, a7);\
-  b4 = _mm_xor_si128(b4, a0);\
-  b5 = _mm_xor_si128(b5, a1);\
-  b6 = _mm_xor_si128(b6, a2);\
-  b7 = _mm_xor_si128(b7, a3);\
-  \
-  b0 = _mm_xor_si128(b0, a6);\
-  b1 = _mm_xor_si128(b1, a7);\
-  b2 = _mm_xor_si128(b2, a0);\
-  b3 = _mm_xor_si128(b3, a1);\
-  b4 = _mm_xor_si128(b4, a2);\
-  b5 = _mm_xor_si128(b5, a3);\
-  b6 = _mm_xor_si128(b6, a4);\
-  b7 = _mm_xor_si128(b7, a5);\
-  \
-  /* spill values y_4, y_5 to memory */\
-  TEMP0 = b0;\
-  TEMP1 = b1;\
-  TEMP2 = b2;\
-  \
-  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
-  b0 = a0;\
-  b1 = a1;\
-  TEMP3 = a2;\
-  \
-  /* compute x_i = t_i + t_{i+3} */\
-  a0 = _mm_xor_si128(a0, a3);\
-  a1 = _mm_xor_si128(a1, a4);\
-  a2 = _mm_xor_si128(a2, a5);\
-  a3 = _mm_xor_si128(a3, a6);\
-  a4 = _mm_xor_si128(a4, a7);\
-  a5 = _mm_xor_si128(a5, b0);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a7 = _mm_xor_si128(a7, TEMP3);\
-  \
-  /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
-  b1 = ALL_1B;\
-  b2 = _mm_xor_si128(b2, b2);\
-  VMUL2(a7, b0, b1, b2);\
-  VMUL2(a6, b0, b1, b2);\
-  VMUL2(a5, b0, b1, b2);\
-  VMUL2(a4, b0, b1, b2);\
-  VMUL2(a3, b0, b1, b2);\
-  VMUL2(a2, b0, b1, b2);\
-  VMUL2(a1, b0, b1, b2);\
-  VMUL2(a0, b0, b1, b2);\
-  \
-  /* compute w_i :  add y_{i+4} */\
-  a0 = _mm_xor_si128(a0, TEMP0);\
-  a1 = _mm_xor_si128(a1, TEMP1);\
-  a2 = _mm_xor_si128(a2, TEMP2);\
-  a3 = _mm_xor_si128(a3, b3);\
-  a4 = _mm_xor_si128(a4, b4);\
-  a5 = _mm_xor_si128(a5, b5);\
-  a6 = _mm_xor_si128(a6, b6);\
-  a7 = _mm_xor_si128(a7, b7);\
-  \
-  /*compute v_i: double w_i */\
-  VMUL2(a0, b0, b1, b2);\
-  VMUL2(a1, b0, b1, b2);\
-  VMUL2(a2, b0, b1, b2);\
-  VMUL2(a3, b0, b1, b2);\
-  VMUL2(a4, b0, b1, b2);\
-  VMUL2(a5, b0, b1, b2);\
-  VMUL2(a6, b0, b1, b2);\
-  VMUL2(a7, b0, b1, b2);\
-  \
-  /* add to y_4 y_5 .. v3, v4, ... */\
-  b0 = _mm_xor_si128(a3, TEMP0);\
-  b1 = _mm_xor_si128(a4, TEMP1);\
-  b2 = _mm_xor_si128(a5, TEMP2);\
-  b3 = _mm_xor_si128(b3, a6);\
-  b4 = _mm_xor_si128(b4, a7);\
-  b5 = _mm_xor_si128(b5, a0);\
-  b6 = _mm_xor_si128(b6, a1);\
-  b7 = _mm_xor_si128(b7, a2);\
-}/*MixBytes*/
-
-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
-#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* Add Round Constant */\
-  b1 = ROUND_CONST_Lx;\
-  a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
-  a1 = _mm_xor_si128(a1, b1);\
-  a2 = _mm_xor_si128(a2, b1);\
-  a3 = _mm_xor_si128(a3, b1);\
-  a4 = _mm_xor_si128(a4, b1);\
-  a5 = _mm_xor_si128(a5, b1);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
-  \
-  /* ShiftBytes + SubBytes (interleaved) */\
-  b0 = _mm_xor_si128(b0,  b0);\
-  a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
-  a0 = _mm_aesenclast_si128(a0, b0);\
-  a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
-  a1 = _mm_aesenclast_si128(a1, b0);\
-  a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
-  a2 = _mm_aesenclast_si128(a2, b0);\
-  a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
-  a3 = _mm_aesenclast_si128(a3, b0);\
-  a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
-  a4 = _mm_aesenclast_si128(a4, b0);\
-  a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
-  a5 = _mm_aesenclast_si128(a5, b0);\
-  a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
-  a6 = _mm_aesenclast_si128(a6, b0);\
-  a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
-  a7 = _mm_aesenclast_si128(a7, b0);\
-  \
-  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-}
-
-/* 10 rounds, P and Q in parallel */
-#define ROUNDS_P_Q(){\
-  ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-}
-
-/* Matrix Transpose Step 1
- * input is a 512-bit state with two columns in one xmm
- * output is a 512-bit state with two rows in one xmm
- * inputs: i0-i3
- * outputs: i0, o1-o3
- * clobbers: t0
- */
-#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  t0 = TRANSP_MASK;\
-  \
-  i0 = _mm_shuffle_epi8(i0, t0);\
-  i1 = _mm_shuffle_epi8(i1, t0);\
-  i2 = _mm_shuffle_epi8(i2, t0);\
-  i3 = _mm_shuffle_epi8(i3, t0);\
-  \
-  o1 = _mm_unpackhi_epi16(i0, i1);\
-  i0 = _mm_unpacklo_epi16(i0, i1);\
-  t0 = _mm_unpackhi_epi16(i2, i3);\
-  i2 = _mm_unpacklo_epi16(i2, i3);\
-  \
-  i0 = _mm_shuffle_epi32(i0, 216);\
-  o1 = _mm_shuffle_epi32(o1, 216);\
-  i2 = _mm_shuffle_epi32(i2, 216);\
-  t0 = _mm_shuffle_epi32(t0, 216);\
-  \
-  o2 = _mm_unpackhi_epi32(i0, i2);\
-  o3 = _mm_unpackhi_epi32(o1, t0);\
-  i0 = _mm_unpacklo_epi32(i0, i2);\
-  o1 = _mm_unpacklo_epi32(o1, t0);\
-}/**/
-
-/* Matrix Transpose Step 2
- * input are two 512-bit states with two rows in one xmm
- * output are two 512-bit states with one row of each state in one xmm
- * inputs: i0-i3 = P, i4-i7 = Q
- * outputs: (i0, o1-o7) = (P|Q)
- * possible reassignments: (output reg = input reg)
- * * i1 -> o3-7
- * * i2 -> o5-7
- * * i3 -> o7
- * * i4 -> o3-7
- * * i5 -> o6-7
- */
-#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
-  o1 = _mm_unpackhi_epi64(i0, i4);\
-  i0 = _mm_unpacklo_epi64(i0, i4);\
-  o2 = _mm_unpacklo_epi64(i1, i5);\
-  o3 = _mm_unpackhi_epi64(i1, i5);\
-  o4 = _mm_unpacklo_epi64(i2, i6);\
-  o5 = _mm_unpackhi_epi64(i2, i6);\
-  o6 = _mm_unpacklo_epi64(i3, i7);\
-  o7 = _mm_unpackhi_epi64(i3, i7);\
-}/**/
-
-/* Matrix Transpose Inverse Step 2
- * input are two 512-bit states with one row of each state in one xmm
- * output are two 512-bit states with two rows in one xmm
- * inputs: i0-i7 = (P|Q)
- * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
- */
-#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
-  o0 = _mm_unpackhi_epi64(i0, i1);\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  o1 = _mm_unpackhi_epi64(i2, i3);\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  o2 = _mm_unpackhi_epi64(i4, i5);\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  o3 = _mm_unpackhi_epi64(i6, i7);\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
-}/**/
-
-/* Matrix Transpose Output Step 2
- * input is one 512-bit state with two rows in one xmm
- * output is one 512-bit state with one row in the low 64-bits of one xmm
- * inputs: i0,i2,i4,i6 = S
- * outputs: (i0-7) = (0|S)
- */
-#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  t0 = _mm_xor_si128(t0, t0);\
-  i1 = _mm_unpackhi_epi64(i0, t0);\
-  i0 = _mm_unpacklo_epi64(i0, t0);\
-  i3 = _mm_unpackhi_epi64(i2, t0);\
-  i2 = _mm_unpacklo_epi64(i2, t0);\
-  i5 = _mm_unpackhi_epi64(i4, t0);\
-  i4 = _mm_unpacklo_epi64(i4, t0);\
-  i7 = _mm_unpackhi_epi64(i6, t0);\
-  i6 = _mm_unpacklo_epi64(i6, t0);\
-}/**/
-
-/* Matrix Transpose Output Inverse Step 2
- * input is one 512-bit state with one row in the low 64-bits of one xmm
- * output is one 512-bit state with two rows in one xmm
- * inputs: i0-i7 = (0|S)
- * outputs: (i0, i2, i4, i6) = S
- */
-#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
-}/**/
-
-
-void INIT256(u64* h)
-{
-  __m128i* const chaining = (__m128i*) h;
-  static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
-  static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
-
-  /* load IV into registers xmm12 - xmm15 */
-  xmm12 = chaining[0];
-  xmm13 = chaining[1];
-  xmm14 = chaining[2];
-  xmm15 = chaining[3];
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
-
-  /* store transposed IV */
-  chaining[0] = xmm12;
-  chaining[1] = xmm2;
-  chaining[2] = xmm6;
-  chaining[3] = xmm7;
-}
-
-void TF512(u64* h, u64* m)
-{
-  __m128i* const chaining = (__m128i*) h;
-  __m128i* const message = (__m128i*) m;
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP0;
-  static __m128i TEMP1;
-  static __m128i TEMP2;
-  static __m128i TEMP3;
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
-
-  /* load message into registers xmm12 - xmm15 */
-  xmm12 = message[0];
-  xmm13 = message[1];
-  xmm14 = message[2];
-  xmm15 = message[3];
-
-  /* transform message M from column ordering into row ordering */
-  /* we first put two rows (64 bit) of the message into one 128-bit xmm register */
-  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
-
-  /* load previous chaining value and xor message to CV to get input of P */
-  /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
-  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  xmm8 = _mm_xor_si128(xmm12, chaining[0]);
-  xmm0 = _mm_xor_si128(xmm2,  chaining[1]);
-  xmm4 = _mm_xor_si128(xmm6,  chaining[2]);
-  xmm5 = _mm_xor_si128(xmm7,  chaining[3]);
-
-  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
-  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
-  /* result: the 8 rows of P and Q in xmm8 - xmm12 */
-  Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
-
-  /* compute the two permutations P and Q in parallel */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P or two rows of Q in one xmm register */
-  Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
-
-  /* xor output of P and Q */
-  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, xmm8);
-  xmm1 = _mm_xor_si128(xmm1, xmm10);
-  xmm2 = _mm_xor_si128(xmm2, xmm12);
-  xmm3 = _mm_xor_si128(xmm3, xmm14);
-
-  /* xor CV (feed-forward) */
-  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, chaining[0]);
-  xmm1 = _mm_xor_si128(xmm1, chaining[1]);
-  xmm2 = _mm_xor_si128(xmm2, chaining[2]);
-  xmm3 = _mm_xor_si128(xmm3, chaining[3]);
-
-  /* store CV */
-  chaining[0] = xmm0;
-  chaining[1] = xmm1;
-  chaining[2] = xmm2;
-  chaining[3] = xmm3;
-
-#ifdef IACA_TRACE
-  IACA_END;
-#endif
-  return;
-}
-
-void OF512(u64* h)
-{
-  __m128i* const chaining = (__m128i*) h;
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP0;
-  static __m128i TEMP1;
-  static __m128i TEMP2;
-  static __m128i TEMP3;
-
-  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
-  xmm8 = chaining[0];
-  xmm10 = chaining[1];
-  xmm12 = chaining[2];
-  xmm14 = chaining[3];
-
-  /* there are now 2 rows of the CV in one xmm register */
-  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
-  /* result: the 8 input rows of P in xmm8 - xmm15 */
-  Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
-
-  /* compute the permutation P */
-  /* result: the output of P(CV) in xmm8 - xmm15 */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P in one xmm register */
-  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
-  Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
-
-  /* xor CV to P output (feed-forward) */
-  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  xmm8 = _mm_xor_si128(xmm8,  (chaining[0]));
-  xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
-  xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
-  xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
-
-  /* transform state back from row ordering into column ordering */
-  /* result: final hash value in xmm9, xmm11 */
-  Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
-
-  /* we only need to return the truncated half of the state */
-  chaining[2] = xmm9;
-  chaining[3] = xmm11;
-}
-
-
--- a/algo/groestl/aes_ni/groestl256-intr-vperm.h
+++ b/algo/groestl/aes_ni/groestl256-intr-vperm.h
@@ -1,793 +0,0 @@
-/* groestl-intr-vperm.h     Aug 2011
- *
- * Groestl implementation with intrinsics using ssse3 instructions.
- * Author: Günther A. Roland, Martin Schläffer
- *
- * Based on the vperm and aes_ni implementations of the hash function Groestl
- * by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
- * Institute of Applied Mathematics, Middle East Technical University, Turkey
- *
- * This code is placed in the public domain
- */
-
-#include <tmmintrin.h>
-#include "hash-groestl256.h"
-
-/* global constants  */
-__m128i ROUND_CONST_Lx;
-__m128i ROUND_CONST_L0[ROUNDS512];
-__m128i ROUND_CONST_L7[ROUNDS512];
-__m128i ROUND_CONST_P[ROUNDS1024];
-__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_0F;
-__m128i ALL_15;
-__m128i ALL_1B;
-__m128i ALL_63;
-__m128i ALL_FF;
-__m128i VPERM_IPT[2];
-__m128i VPERM_OPT[2];
-__m128i VPERM_INV[2];
-__m128i VPERM_SB1[2];
-__m128i VPERM_SB2[2];
-__m128i VPERM_SB4[2];
-__m128i VPERM_SBO[2];
-
-
-#define tos(a)    #a
-#define tostr(a)  tos(a)
-
-#define SET_SHARED_CONSTANTS(){\
-  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
-  ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
-  ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
-  ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\
-  ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\
-  VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\
-  VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\
-  VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\
-  VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\
-  VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\
-  VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\
-  VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\
-  VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\
-  VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\
-  VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\
-  VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\
-  VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\
-}/**/
-
-/* VPERM
- * Transform w/o settings c*
- * transforms 2 rows to/from "vperm mode"
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0, a1 = 2 rows
- * table = transformation table to use
- * t*, c* = clobbers
- * outputs:
- * a0, a1 = 2 rows transformed with table
- * */
-#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
-  t0 = c0;\
-  t1 = c0;\
-  t0 = _mm_andnot_si128(t0, a0);\
-  t1 = _mm_andnot_si128(t1, a1);\
-  t0 = _mm_srli_epi32(t0, 4);\
-  t1 = _mm_srli_epi32(t1, 4);\
-  a0 = _mm_and_si128(a0, c0);\
-  a1 = _mm_and_si128(a1, c0);\
-  t2 = c2;\
-  t3 = c2;\
-  t2 = _mm_shuffle_epi8(t2, a0);\
-  t3 = _mm_shuffle_epi8(t3, a1);\
-  a0 = c1;\
-  a1 = c1;\
-  a0 = _mm_shuffle_epi8(a0, t0);\
-  a1 = _mm_shuffle_epi8(a1, t1);\
-  a0 = _mm_xor_si128(a0, t2);\
-  a1 = _mm_xor_si128(a1, t3);\
-}/**/
-
-#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
-  c0 = ALL_0F;\
-  c1 = ((__m128i*) table )[0];\
-  c2 = ((__m128i*) table )[1];\
-}/**/
-
-/* VPERM
- * Transform
- * transforms 2 rows to/from "vperm mode"
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0, a1 = 2 rows
- * table = transformation table to use
- * t*, c* = clobbers
- * outputs:
- * a0, a1 = 2 rows transformed with table
- * */
-#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
-  VPERM_Transform_Set_Const(table, c0, c1, c2);\
-  VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Transform State
- * inputs:
- * a0-a3 = state
- * table = transformation table to use
- * t* = clobbers
- * outputs:
- * a0-a3 = transformed state
- * */
-#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
-  VPERM_Transform_Set_Const(table, c0, c1, c2);\
-  VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
-  VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Add Constant to State
- * inputs:
- * a0-a7 = state
- * constant = constant to add
- * t0 = clobber
- * outputs:
- * a0-a7 = state + constant
- * */
-#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
-  t0 = constant;\
-  a0 = _mm_xor_si128(a0,  t0);\
-  a1 = _mm_xor_si128(a1,  t0);\
-  a2 = _mm_xor_si128(a2,  t0);\
-  a3 = _mm_xor_si128(a3,  t0);\
-  a4 = _mm_xor_si128(a4,  t0);\
-  a5 = _mm_xor_si128(a5,  t0);\
-  a6 = _mm_xor_si128(a6,  t0);\
-  a7 = _mm_xor_si128(a7,  t0);\
-}/**/
-
-/* VPERM
- * Set Substitute Core Constants
- * */
-#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
-  VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
-}/**/
-
-/* VPERM
- * Substitute Core
- * first part of sbox inverse computation
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0 = 1 row
- * t*, c* = clobbers
- * outputs:
- * b0a, b0b = inputs for lookup step
- * */
-#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
-  t0 = c0;\
-  t0 = _mm_andnot_si128(t0, a0);\
-  t0 = _mm_srli_epi32(t0, 4);\
-  a0 = _mm_and_si128(a0,  c0);\
-  b0a = c1;\
-  b0a = _mm_shuffle_epi8(b0a, a0);\
-  a0 = _mm_xor_si128(a0,  t0);\
-  b0b = c2;\
-  b0b = _mm_shuffle_epi8(b0b, t0);\
-  b0b = _mm_xor_si128(b0b, b0a);\
-  t1 = c2;\
-  t1 = _mm_shuffle_epi8(t1,  a0);\
-  t1 = _mm_xor_si128(t1,  b0a);\
-  b0a = c2;\
-  b0a = _mm_shuffle_epi8(b0a, b0b);\
-  b0a = _mm_xor_si128(b0a, a0);\
-  b0b = c2;\
-  b0b = _mm_shuffle_epi8(b0b, t1);\
-  b0b = _mm_xor_si128(b0b, t0);\
-}/**/
-
-/* VPERM
- * Lookup
- * second part of sbox inverse computation
- * this function is derived from:
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0a, a0b = output of Substitution Core
- * table = lookup table to use (*1 / *2 / *4)
- * t0 = clobber
- * outputs:
- * b0 = output of sbox + multiplication
- * */
-#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
-  b0 = ((__m128i*) table )[0];\
-  t0 = ((__m128i*) table )[1];\
-  b0 = _mm_shuffle_epi8(b0, a0b);\
-  t0 = _mm_shuffle_epi8(t0, a0a);\
-  b0 = _mm_xor_si128(b0, t0);\
-}/**/
-
-/* VPERM
- * SubBytes and *2 / *4
- * this function is derived from:
- *   Constant-time SSSE3 AES core implementation
- *   by Mike Hamburg
- * and
- *   vperm and aes_ni implementations of hash function Grostl
- *   by Cagdas CALIK
- * inputs:
- * a0-a7 = state
- * t*, c* = clobbers
- * outputs:
- * a0-a7 = state * 4
- * c2 = row0 * 2 -> b0
- * c1 = row7 * 2 -> b3
- * c0 = row7 * 1 -> b4
- * t2 = row4 * 1 -> b7
- * TEMP_MUL1 = row(i) * 1
- * TEMP_MUL2 = row(i) * 2
- *
- * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
-#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
-  /* set Constants */\
-  VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
-  /* row 1 */\
-  VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[1] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[1] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
-  /* --- */\
-  /* row 2 */\
-  VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[2] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[2] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
-  /* --- */\
-  /* row 3 */\
-  VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[3] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[3] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
-  /* --- */\
-  /* row 5 */\
-  VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[5] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[5] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
-  /* --- */\
-  /* row 6 */\
-  VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[6] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[6] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
-  /* --- */\
-  /* row 7 */\
-  VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
-  TEMP_MUL1[7] = t2;\
-  VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
-  /* --- */\
-  /* row 4 */\
-  VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
-  VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
-  TEMP_MUL2[4] = t3;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
-  /* --- */\
-  /* row 0 */\
-  VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
-  VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
-  VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
-  TEMP_MUL2[0] = c2;\
-  VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
-  /* --- */\
-}/**/
-
-
-/* Optimized MixBytes
- * inputs:
- * a0-a7 = (row0-row7) * 4
- * b0 = row0 * 2
- * b3 = row7 * 2
- * b4 = row7 * 1
- * b7 = row4 * 1
- * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
- * output: b0-b7
- * */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* save one value */\
-  TEMP_MUL4 = a3;\
-  /* 1 */\
-  b1 = a0;\
-  b1 = _mm_xor_si128(b1, a5);\
-  b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
-  b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
-  b2 = b1;\
-  \
-  /* 2 */\
-  b5 = a1;\
-  b5 = _mm_xor_si128(b5, a4);\
-  b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
-  b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
-  b6 = b5;\
-  \
-  /* 4 */\
-  b7 = _mm_xor_si128(b7, a6);\
-  /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
-  b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
-  b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
-  b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
-  b2 = _mm_xor_si128(b2, b7);\
-  \
-  /* 3 */\
-  b0 = _mm_xor_si128(b0, a7);\
-  b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
-  b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
-  /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
-  b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
-  b3 = b0;\
-  b1 = _mm_xor_si128(b1, b0);\
-  b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
-  \
-  /* 5 */\
-  b4 = _mm_xor_si128(b4, a2);\
-  /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
-  b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
-  b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
-  b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
-  b3 = _mm_xor_si128(b3, b4);\
-  b6 = _mm_xor_si128(b6, b4);\
-  \
-  /* 6 */\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
-  b4 = _mm_xor_si128(b4, a3);\
-  b5 = _mm_xor_si128(b5, a3);\
-  b7 = _mm_xor_si128(b7, a3);\
-  \
-  /* 7 */\
-  a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
-  a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
-  b2 = _mm_xor_si128(b2, a1);\
-  b3 = _mm_xor_si128(b3, a1);\
-  \
-  /* 8 */\
-  a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
-  a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
-  b6 = _mm_xor_si128(b6, a5);\
-  b7 = _mm_xor_si128(b7, a5);\
-  \
-  /* 9 */\
-  a3 = TEMP_MUL1[2];\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
-  b0 = _mm_xor_si128(b0, a3);\
-  b5 = _mm_xor_si128(b5, a3);\
-  \
-  /* 10 */\
-  a1 = TEMP_MUL1[6];\
-  a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
-  b1 = _mm_xor_si128(b1, a1);\
-  b4 = _mm_xor_si128(b4, a1);\
-  \
-  /* 11 */\
-  a5 = TEMP_MUL1[3];\
-  a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
-  b1 = _mm_xor_si128(b1, a5);\
-  b6 = _mm_xor_si128(b6, a5);\
-  \
-  /* 12 */\
-  a3 = TEMP_MUL1[7];\
-  a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
-  b2 = _mm_xor_si128(b2, a3);\
-  b5 = _mm_xor_si128(b5, a3);\
-  \
-  /* 13 */\
-  b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
-  b0 = _mm_xor_si128(b0, a4);\
-  b1 = _mm_xor_si128(b1, a4);\
-  b3 = _mm_xor_si128(b3, a6);\
-  b4 = _mm_xor_si128(b4, a0);\
-  b4 = _mm_xor_si128(b4, a7);\
-  b5 = _mm_xor_si128(b5, a0);\
-  b7 = _mm_xor_si128(b7, a2);\
-}/**/
-
-#define SET_CONSTANTS(){\
-  SET_SHARED_CONSTANTS();\
-  SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\
-  SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\
-  SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\
-  SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\
-  SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\
-  SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\
-  SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\
-  SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
-    ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
-  }\
-  ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
-}/**/
-
-/* vperm:
- * transformation before rounds with ipt
- * first round add transformed constant
- * middle rounds: add constant XOR 0x15...15
- * last round: additionally add 0x15...15 after MB
- * transformation after rounds with opt
- */
-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
-#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* AddRoundConstant + ShiftBytes (interleaved) */\
-  b1 = ROUND_CONST_Lx;\
-  a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
-  a1 = _mm_xor_si128(a1, b1);\
-  a2 = _mm_xor_si128(a2, b1);\
-  a3 = _mm_xor_si128(a3, b1);\
-  a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
-  a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
-  a4 = _mm_xor_si128(a4, b1);\
-  a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
-  a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
-  a5 = _mm_xor_si128(a5, b1);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
-  a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
-  a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
-  a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
-  a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
-  /* SubBytes + Multiplication by 2 and 4 */\
-  VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
-  /* MixBytes */\
-  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-}/**/
-
-/* 10 rounds, P and Q in parallel */
-#define ROUNDS_P_Q(){\
-  VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
-  ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
-  ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
-  VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
-}
-
-
-/* Matrix Transpose Step 1
- * input is a 512-bit state with two columns in one xmm
- * output is a 512-bit state with two rows in one xmm
- * inputs: i0-i3
- * outputs: i0, o1-o3
- * clobbers: t0
- */
-#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  t0 = TRANSP_MASK;\
-\
-  i0 = _mm_shuffle_epi8(i0, t0);\
-  i1 = _mm_shuffle_epi8(i1, t0);\
-  i2 = _mm_shuffle_epi8(i2, t0);\
-  i3 = _mm_shuffle_epi8(i3, t0);\
-\
-  o1 = i0;\
-  t0 = i2;\
-\
-  i0 = _mm_unpacklo_epi16(i0, i1);\
-  o1 = _mm_unpackhi_epi16(o1, i1);\
-  i2 = _mm_unpacklo_epi16(i2, i3);\
-  t0 = _mm_unpackhi_epi16(t0, i3);\
-\
-  i0 = _mm_shuffle_epi32(i0, 216);\
-  o1 = _mm_shuffle_epi32(o1, 216);\
-  i2 = _mm_shuffle_epi32(i2, 216);\
-  t0 = _mm_shuffle_epi32(t0, 216);\
-\
-  o2 = i0;\
-  o3 = o1;\
-\
-  i0 = _mm_unpacklo_epi32(i0, i2);\
-  o1 = _mm_unpacklo_epi32(o1, t0);\
-  o2 = _mm_unpackhi_epi32(o2, i2);\
-  o3 = _mm_unpackhi_epi32(o3, t0);\
-}/**/
-
-/* Matrix Transpose Step 2
- * input are two 512-bit states with two rows in one xmm
- * output are two 512-bit states with one row of each state in one xmm
- * inputs: i0-i3 = P, i4-i7 = Q
- * outputs: (i0, o1-o7) = (P|Q)
- * possible reassignments: (output reg = input reg)
- * * i1 -> o3-7
- * * i2 -> o5-7
- * * i3 -> o7
- * * i4 -> o3-7
- * * i5 -> o6-7
- */
-#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
-  o1 = i0;\
-  o2 = i1;\
-  i0 = _mm_unpacklo_epi64(i0, i4);\
-  o1 = _mm_unpackhi_epi64(o1, i4);\
-  o3 = i1;\
-  o4 = i2;\
-  o2 = _mm_unpacklo_epi64(o2, i5);\
-  o3 = _mm_unpackhi_epi64(o3, i5);\
-  o5 = i2;\
-  o6 = i3;\
-  o4 = _mm_unpacklo_epi64(o4, i6);\
-  o5 = _mm_unpackhi_epi64(o5, i6);\
-  o7 = i3;\
-  o6 = _mm_unpacklo_epi64(o6, i7);\
-  o7 = _mm_unpackhi_epi64(o7, i7);\
-}/**/
-
-/* Matrix Transpose Inverse Step 2
- * input are two 512-bit states with one row of each state in one xmm
- * output are two 512-bit states with two rows in one xmm
- * inputs: i0-i7 = (P|Q)
- * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
- */
-#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
-  o0 = i0;\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  o0 = _mm_unpackhi_epi64(o0, i1);\
-  o1 = i2;\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  o1 = _mm_unpackhi_epi64(o1, i3);\
-  o2 = i4;\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  o2 = _mm_unpackhi_epi64(o2, i5);\
-  o3 = i6;\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
-  o3 = _mm_unpackhi_epi64(o3, i7);\
-}/**/
-
-/* Matrix Transpose Output Step 2
- * input is one 512-bit state with two rows in one xmm
- * output is one 512-bit state with one row in the low 64-bits of one xmm
- * inputs: i0,i2,i4,i6 = S
- * outputs: (i0-7) = (0|S)
- */
-#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  t0 = _mm_xor_si128(t0, t0);\
-  i1 = i0;\
-  i3 = i2;\
-  i5 = i4;\
-  i7 = i6;\
-  i0 = _mm_unpacklo_epi64(i0, t0);\
-  i1 = _mm_unpackhi_epi64(i1, t0);\
-  i2 = _mm_unpacklo_epi64(i2, t0);\
-  i3 = _mm_unpackhi_epi64(i3, t0);\
-  i4 = _mm_unpacklo_epi64(i4, t0);\
-  i5 = _mm_unpackhi_epi64(i5, t0);\
-  i6 = _mm_unpacklo_epi64(i6, t0);\
-  i7 = _mm_unpackhi_epi64(i7, t0);\
-}/**/
-
-/* Matrix Transpose Output Inverse Step 2
- * input is one 512-bit state with one row in the low 64-bits of one xmm
- * output is one 512-bit state with two rows in one xmm
- * inputs: i0-i7 = (0|S)
- * outputs: (i0, i2, i4, i6) = S
- */
-#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
-}/**/
-
-
-/* transform round constants into VPERM mode */
-#define VPERM_Transform_RoundConst_CNT2(i, j){\
-  xmm0 = ROUND_CONST_L0[i];\
-  xmm1 = ROUND_CONST_L7[i];\
-  xmm2 = ROUND_CONST_L0[j];\
-  xmm3 = ROUND_CONST_L7[j];\
-  VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
-  xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
-  xmm1 = _mm_xor_si128(xmm1, (ALL_15));\
-  xmm2 = _mm_xor_si128(xmm2, (ALL_15));\
-  xmm3 = _mm_xor_si128(xmm3, (ALL_15));\
-  ROUND_CONST_L0[i] = xmm0;\
-  ROUND_CONST_L7[i] = xmm1;\
-  ROUND_CONST_L0[j] = xmm2;\
-  ROUND_CONST_L7[j] = xmm3;\
-}/**/
-
-/* transform round constants into VPERM mode */
-#define VPERM_Transform_RoundConst(){\
-  xmm0 = ROUND_CONST_Lx;\
-  VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
-  xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
-  ROUND_CONST_Lx = xmm0;\
-  VPERM_Transform_RoundConst_CNT2(0, 1);\
-  VPERM_Transform_RoundConst_CNT2(2, 3);\
-  VPERM_Transform_RoundConst_CNT2(4, 5);\
-  VPERM_Transform_RoundConst_CNT2(6, 7);\
-  VPERM_Transform_RoundConst_CNT2(8, 9);\
-}/**/
-
-void INIT256(u64* h)
-{
-  __m128i* const chaining = (__m128i*) h;
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15;
-
-  /* transform round constants into VPERM mode */
-  VPERM_Transform_RoundConst();
-
-  /* load IV into registers xmm12 - xmm15 */
-  xmm12 = chaining[0];
-  xmm13 = chaining[1];
-  xmm14 = chaining[2];
-  xmm15 = chaining[3];
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
-  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
-
-  /* store transposed IV */
-  chaining[0] = xmm12;
-  chaining[1] = xmm2;
-  chaining[2] = xmm6;
-  chaining[3] = xmm7;
-}
-
-void TF512(u64* h, u64* m)
-{
-  __m128i* const chaining = (__m128i*) h;
-  __m128i* const message = (__m128i*) m;
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP_MUL1[8];
-  static __m128i TEMP_MUL2[8];
-  static __m128i TEMP_MUL4;
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
-
-  /* load message into registers xmm12 - xmm15 */
-  xmm12 = message[0];
-  xmm13 = message[1];
-  xmm14 = message[2];
-  xmm15 = message[3];
-
-  /* transform message M from column ordering into row ordering */
-  /* we first put two rows (64 bit) of the message into one 128-bit xmm register */
-  VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
-  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
-
-  /* load previous chaining value */
-  /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
-  xmm8 = chaining[0];
-  xmm0 = chaining[1];
-  xmm4 = chaining[2];
-  xmm5 = chaining[3];
-
-  /* xor message to CV get input of P */
-  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  xmm8 = _mm_xor_si128(xmm8, xmm12);
-  xmm0 = _mm_xor_si128(xmm0, xmm2);
-  xmm4 = _mm_xor_si128(xmm4, xmm6);
-  xmm5 = _mm_xor_si128(xmm5, xmm7);
-
-  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
-  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
-  /* result: the 8 rows of P and Q in xmm8 - xmm12 */
-  Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
-
-  /* compute the two permutations P and Q in parallel */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P or two rows of Q in one xmm register */
-  Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
-
-  /* xor output of P and Q */
-  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, xmm8);
-  xmm1 = _mm_xor_si128(xmm1, xmm10);
-  xmm2 = _mm_xor_si128(xmm2, xmm12);
-  xmm3 = _mm_xor_si128(xmm3, xmm14);
-
-  /* xor CV (feed-forward) */
-  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
-  xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
-  xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
-  xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
-
-  /* store CV */
-  chaining[0] = xmm0;
-  chaining[1] = xmm1;
-  chaining[2] = xmm2;
-  chaining[3] = xmm3;
-
-#ifdef IACA_TRACE
-  IACA_END;
-#endif
-
-  return;
-}
-
-void OF512(u64* h)
-{
-  __m128i* const chaining = (__m128i*) h;
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP_MUL1[8];
-  static __m128i TEMP_MUL2[8];
-  static __m128i TEMP_MUL4;
-
-  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
-  xmm8 = chaining[0];
-  xmm10 = chaining[1];
-  xmm12 = chaining[2];
-  xmm14 = chaining[3];
-
-  /* there are now 2 rows of the CV in one xmm register */
-  /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
-  /* result: the 8 input rows of P in xmm8 - xmm15 */
-  Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
-
-  /* compute the permutation P */
-  /* result: the output of P(CV) in xmm8 - xmm15 */
-  ROUNDS_P_Q();
-
-  /* unpack again to get two rows of P in one xmm register */
-  /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
-  Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
-
-  /* xor CV to P output (feed-forward) */
-  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  xmm8 = _mm_xor_si128(xmm8,  (chaining[0]));
-  xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
-  xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
-  xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
-
-  /* transform state back from row ordering into column ordering */
-  /* result: final hash value in xmm9, xmm11 */
-  Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
-  VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
-
-  /* we only need to return the truncated half of the state */
-  chaining[2] = xmm9;
-  chaining[3] = xmm11;
-
-  return;
-}//OF512()
-
-
-
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -16,48 +16,13 @@

 #ifdef __AES__

-#include "groestl-version.h"
-
-#ifdef TASM
-  #ifdef VAES
-    #include "groestl-asm-aes.h"
-  #else
-    #ifdef VAVX
-      #include "groestl-asm-avx.h"
-    #else
-      #ifdef VVPERM
-        #include "groestl-asm-vperm.h"
-      #else
-        #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-      #endif
-    #endif
-  #endif
-#else
-  #ifdef TINTR
-    #ifdef VAES
-      #include "groestl-intr-aes.h"
-    #else
-      #ifdef VAVX
-        #include "groestl-intr-avx.h"
-      #else
-        #ifdef VVPERM
-          #include "groestl-intr-vperm.h"
-        #else
-          #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-        #endif
-      #endif
-    #endif
-  #else
-    #error NO TYPE SPECIFIED (-DT[ASM/INTR])
-  #endif
-#endif
+#include "groestl-intr-aes.h"

 HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
 {
  int i;

  ctx->hashlen = hashlen;
-  SET_CONSTANTS();

  if (ctx->chaining == NULL || ctx->buffer == NULL)
    return FAIL_GR;
@@ -70,8 +35,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )

  // The only non-zero in the IV is len. It can be hard coded.
  ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
-//  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-//  INIT(ctx->chaining);

  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;
@@ -92,8 +55,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
     ctx->buffer[i]   = _mm_setzero_si128();
  }
  ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
-//  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-//  INIT(ctx->chaining);
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

@@ -109,7 +70,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
 // 5. Midstate will work at reduced impact than full hash, if total hash
 //    (midstate + tail) is less than 1 block.
 //    This, unfortunately, is the case with all current users.
-// 6. the morefull blocks the bigger the gain
+// 6. the more full blocks the bigger the gain

 // use only for midstate precalc
 HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
@@ -143,12 +104,11 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
 // deprecated do not use
 HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
 {
-   const int len = (int)ctx->databitlen / 128;  // bits to __m128i 
-   const int blocks = ctx->blk_count + 1;       // adjust for final block
-
-   const int rem_ptr = ctx->rem_ptr;      // end of data start of padding
-   const int hashlen_m128i = ctx->hashlen / 16;  // bytes to __m128i
-   const int hash_offset = SIZE512 - hashlen_m128i;  // where in buffer
+   const int len = (int)ctx->databitlen / 128; // bits to __m128i 
+   const uint64_t blocks = ctx->blk_count + 1; // adjust for final block
+   const int rem_ptr = ctx->rem_ptr;           // end of data start of padding
+   const int hashlen_m128i = ctx->hashlen / 16;     // bytes to __m128i
+   const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
   int i;

   // first pad byte = 0x80, last pad byte = block count
@@ -157,21 +117,18 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
   if ( rem_ptr == len - 1 )
   {
       // only 128 bits left in buffer, all padding at once
-       ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
-                                                  0,0,0,0, 0,0,0,0x80 );
+      ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 );
   }
   else
   {
       // add first padding
-       ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                            0,0,0,0, 0,0,0,0x80 );
+       ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 );
       // add zero padding
       for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = _mm_setzero_si128();

       // add length padding, second last byte is zero unless blocks > 255
-       ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
-                                           0,         0 ,0,0, 0,0,0,0 );
+       ctx->buffer[i] = _mm_set_epi64x( ( blocks << 56 ) | ( ( blocks & 0xff00 ) << 40 ), 0 ); // bytes 15,14 = low 16 bits of block count, big-endian
   }

   // digest final padding block and do output transform
@@ -189,21 +146,20 @@ int groestl512_full( hashState_groestl* ctx, void* output,
                                const void* input, uint64_t databitlen )
 {

-  int i;
-
-  ctx->hashlen = 64;
-  SET_CONSTANTS();
-
-  for ( i = 0; i < SIZE512; i++ )
-  {
-     ctx->chaining[i] = _mm_setzero_si128();
-     ctx->buffer[i]   = _mm_setzero_si128();
-  }
-  ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
-  ctx->buf_ptr = 0;
-  ctx->rem_ptr = 0;
+   int i;
+   ctx->hashlen = 64;

+   for ( i = 0; i < SIZE512; i++ )
+   {
+      ctx->chaining[i] = _mm_setzero_si128();
+      ctx->buffer[i]   = _mm_setzero_si128();
+   }
+   ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
+   ctx->buf_ptr = 0;
+   ctx->rem_ptr = 0;

+   // --- update ---
+   
   const int len = (int)databitlen / 128;
   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
   const int hash_offset = SIZE512 - hashlen_m128i;
@@ -211,8 +167,6 @@ int groestl512_full( hashState_groestl* ctx, void* output,
   uint64_t blocks = len / SIZE512;
   __m128i* in = (__m128i*)input;

-   // --- update ---
-
   // digest any full blocks, process directly from input 
   for ( i = 0; i < blocks; i++ )
      TF1024( ctx->chaining, &in[ i * SIZE512 ] );
@@ -231,26 +185,22 @@ int groestl512_full( hashState_groestl* ctx, void* output,
   if ( i == len -1 )
   {
       // only 128 bits left in buffer, all padding at once
-       ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
-                                           0,0,0,0, 0,0,0,0x80 );
+      ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
   }
   else
   {
       // add first padding
-       ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                      0,0,0,0, 0,0,0,0x80 );
+       ctx->buffer[i] = m128_const_64( 0, 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = _mm_setzero_si128();

       // add length padding, second last byte is zero unless blocks > 255
-       ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
-                                           0,         0 ,0,0, 0,0,0,0 );
+       ctx->buffer[i] = _mm_set_epi64x( ( blocks << 56 ) | ( ( blocks & 0xff00 ) << 40 ), 0 ); // bytes 15,14 = low 16 bits of block count, big-endian
   }

   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );
-
   OF1024( ctx->chaining );

   // store hash result in output 
@@ -268,7 +218,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
   const int hash_offset = SIZE512 - hashlen_m128i;
   int rem = ctx->rem_ptr;
-   int blocks = len / SIZE512;
+   uint64_t blocks = len / SIZE512;
   __m128i* in = (__m128i*)input;
   int i;

@@ -292,26 +242,22 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
   if ( i == len -1 )
   {        
       // only 128 bits left in buffer, all padding at once
-       ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
-                                           0,0,0,0, 0,0,0,0x80 );
+      ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
   }   
   else
   {
       // add first padding
-       ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, 
-                                      0,0,0,0, 0,0,0,0x80 );
+       ctx->buffer[i] = m128_const_64( 0, 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = _mm_setzero_si128();

       // add length padding, second last byte is zero unless blocks > 255
-       ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, 
-                                           0,         0 ,0,0, 0,0,0,0 );
+       ctx->buffer[i] = _mm_set_epi64x( ( blocks << 56 ) | ( ( blocks & 0xff00 ) << 40 ), 0 ); // bytes 15,14 = low 16 bits of block count, big-endian
   }

   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );
-
   OF1024( ctx->chaining );

   // store hash result in output 
--- a/algo/groestl/aes_ni/hash-groestl256.c
+++ b/algo/groestl/aes_ni/hash-groestl256.c
@@ -13,41 +13,7 @@

 #ifdef __AES__

-#include "groestl-version.h"
-
-#ifdef TASM
-  #ifdef VAES
-    #include "groestl256-asm-aes.h"
-  #else
-    #ifdef VAVX
-      #include "groestl256-asm-avx.h"
-    #else
-      #ifdef VVPERM
-        #include "groestl256-asm-vperm.h"
-      #else
-        #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-      #endif
-    #endif
-  #endif
-#else
-  #ifdef TINTR
-    #ifdef VAES
-      #include "groestl256-intr-aes.h"
-    #else
-      #ifdef VAVX
-        #include "groestl256-intr-avx.h"
-      #else
-        #ifdef VVPERM
-          #include "groestl256-intr-vperm.h"
-        #else
-          #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-        #endif
-      #endif
-    #endif
-  #else
-    #error NO TYPE SPECIFIED (-DT[ASM/INTR])
-  #endif
-#endif
+#include "groestl256-intr-aes.h"

 /* initialise context */
 HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
@@ -55,7 +21,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
  int i;

  ctx->hashlen = hashlen;
-  SET_CONSTANTS();

  if (ctx->chaining == NULL || ctx->buffer == NULL)
    return FAIL_GR;