Initial upload v3.4.7

2026-02-22 16:33:08 +00:00 · 2016-09-22 13:16:18 -04:00
parent a3c8079774
commit a35039bc05
480 changed files with 211015 additions and 3 deletions
--- a/algo/groestl/sse2/.dirstamp
+++ b/algo/groestl/sse2/.dirstamp
--- a/algo/groestl/sse2/.grso.c.swo
+++ b/algo/groestl/sse2/.grso.c.swo
--- a/algo/groestl/sse2/brg_endian.h
+++ b/algo/groestl/sse2/brg_endian.h
@@ -0,0 +1,133 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The redistribution and use of this software (with or without changes)
+ is allowed without the payment of fees or royalties provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+*/
+
+#ifndef _BRG_ENDIAN_H
+#define _BRG_ENDIAN_H
+
+/* The two values that PLATFORM_BYTE_ORDER is set to further down */
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+/* Include files where endian defines and byteswap functions may reside. */
+/* At most one branch is taken.  In the last branch MinGW and AIX        */
+/* include nothing, and <byteswap.h> is skipped on BeOS.                 */
+#if defined( __sun )
+#  include <sys/isa_defs.h>
+#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ ) && !defined( _AIX )
+#    include <endian.h>
+#    if !defined( __BEOS__ )
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif
+
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+/*                                                                  */
+/* The same test is repeated for each form:                         */
+/*  - if both the BIG and LITTLE symbols are defined they are taken */
+/*    to be values and the matching BYTE_ORDER symbol is compared   */
+/*    against them (nothing is defined if BYTE_ORDER is missing or  */
+/*    matches neither);                                             */
+/*  - if only one of the two is defined, its presence alone selects */
+/*    the byte order.                                               */
+/* The four tests are independent #if groups, so a later form may   */
+/* define PLATFORM_BYTE_ORDER again after an earlier one has.       */
+
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( _BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/*  if the platform byte order could not be determined, then try to */
+/*  set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#  error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
+#endif
+
+#endif
+
+#endif
--- a/algo/groestl/sse2/brg_types.h
+++ b/algo/groestl/sse2/brg_types.h
@@ -0,0 +1,231 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
+
+ (a few lines added by Soeren S. Thomsen, October 2008)
+
+ LICENSE TERMS
+
+ The redistribution and use of this software (with or without changes)
+ is allowed without the payment of fees or royalties provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+
+ The unsigned integer types defined here are of the form uint_<nn>t where
+ <nn> is the length of the type; for example, the unsigned 32-bit type is
+ 'uint_32t'.  These are NOT the same as the 'C99 integer types' that are
+ defined in the inttypes.h and stdint.h headers since attempts to use these
+ types have shown that support for them is still highly variable.  However,
+ since the latter are of the form uint<nn>_t, a regular expression search
+ and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
+ can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef _BRG_TYPES_H
+#define _BRG_TYPES_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <limits.h>
+
+/* ptrint_t is the integer type the ALIGN_* macros below cast pointers to. */
+/* It is intptr_t on MSVC >= 1300 and on GCC >= 3, and plain int otherwise */
+/* NOTE(review): the int fallback may be narrower than a pointer - confirm */
+/* before using it on a 64-bit target with another compiler                */
+#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
+#  include <stddef.h>
+#  define ptrint_t intptr_t
+#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
+#  include <stdint.h>
+#  define ptrint_t intptr_t
+#else
+#  define ptrint_t int
+#endif
+
+/* uint_8t: unsigned char when it is exactly 8 bits wide, otherwise a     */
+/* compile-time error.  Define BRG_UI8 beforehand to skip this definition */
+#ifndef BRG_UI8
+#  define BRG_UI8
+#  if UCHAR_MAX == 255u
+     typedef unsigned char uint_8t;
+#  else
+#    error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+/* uint_16t: unsigned short when it is exactly 16 bits wide, otherwise a   */
+/* compile-time error.  Define BRG_UI16 beforehand to skip this definition */
+#ifndef BRG_UI16
+#  define BRG_UI16
+#  if USHRT_MAX == 65535u
+     typedef unsigned short uint_16t;
+#  else
+#    error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
+#  endif
+#endif
+
+/* uint_32t: unsigned int if it is 32 bits wide, else unsigned long if that */
+/* is 32 bits wide, else a compile-time error.  li_32(h) builds a hex       */
+/* literal of the chosen type from bare hex digits, e.g. li_32(1f) gives    */
+/* 0x1fu or 0x1ful.  Define BRG_UI32 beforehand to skip this definition     */
+#ifndef BRG_UI32
+#  define BRG_UI32
+#  if UINT_MAX == 4294967295u
+#    define li_32(h) 0x##h##u
+     typedef unsigned int uint_32t;
+#  elif ULONG_MAX == 4294967295u
+#    define li_32(h) 0x##h##ul
+     typedef unsigned long uint_32t;
+#  elif defined( _CRAY )
+#    error This code needs 32-bit data types, which Cray machines do not provide
+#  else
+#    error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+/* uint_64t: picks a 64-bit unsigned type and a matching li_64(h) literal */
+/* macro.  Compiler-specific cases are tried first, then unsigned int,    */
+/* unsigned long and unsigned long long according to <limits.h>.  Unlike  */
+/* the narrower types, failing to find one is not an error here: BRG_UI64 */
+/* is simply left undefined, and an error is raised afterwards only if    */
+/* the including code defined NEED_UINT_64T                               */
+#ifndef BRG_UI64
+#  if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( _MSC_VER ) && ( _MSC_VER < 1300 )    /* 1300 == VC++ 7.0 */
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned long long uint_64t;
+#  elif defined( __MVS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned int long long uint_64t;
+#  elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
+#    if UINT_MAX == 18446744073709551615u
+#      define BRG_UI64
+#      define li_64(h) 0x##h##u
+       typedef unsigned int uint_64t;
+#    endif
+#  elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
+#    if ULONG_MAX == 18446744073709551615ul
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ul
+       typedef unsigned long uint_64t;
+#    endif
+#  elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
+#    if ULLONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
+#    if ULONG_LONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  endif
+#endif
+
+/* only complain about a missing 64-bit type when the user asked for one */
+#if !defined( BRG_UI64 )
+#  if defined( NEED_UINT_64T )
+#    error Please define uint_64t as an unsigned 64 bit type in brg_types.h
+#  endif
+#endif
+
+/* VOID_RETURN / INT_RETURN: return-type prefixes for function declarations */
+/* that fold in DLL export/import attributes and the calling convention.    */
+/*   DLL_EXPORT defined : dllexport (plus __stdcall on MSVC / Intel)        */
+/*   DLL_IMPORT defined : dllimport (plus __stdcall on MSVC / Intel)        */
+/*   Watcom             : __cdecl                                           */
+/*   anything else      : plain void / int                                  */
+/* Define RETURN_VALUES beforehand to supply your own definitions           */
+#ifndef RETURN_VALUES
+#  define RETURN_VALUES
+#  if defined( DLL_EXPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllexport ) void __stdcall
+#      define INT_RETURN     __declspec( dllexport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllexport__ ) void
+#      define INT_RETURN     __declspec( __dllexport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( DLL_IMPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllimport ) void __stdcall
+#      define INT_RETURN     __declspec( dllimport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllimport__ ) void
+#      define INT_RETURN     __declspec( __dllimport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( __WATCOMC__ )
+#    define VOID_RETURN  void __cdecl
+#    define INT_RETURN   int  __cdecl
+#  else
+#    define VOID_RETURN  void
+#    define INT_RETURN   int
+#  endif
+#endif
+
+/*  These defines are used to detect and set the memory alignment of pointers.
+    Note that offsets are in bytes.  In all three, 'n' must be a power of 2
+    because the low bits are isolated with the mask (n - 1).  The pointer is
+    converted to the integer type ptrint_t for the arithmetic; 'x' and 'n'
+    each appear more than once, so avoid arguments with side effects.
+
+    ALIGN_OFFSET(x,n)           return the positive or zero offset of
+                                the memory addressed by the pointer 'x'
+                                from an address that is aligned on an
+                                'n' byte boundary ('n' is a power of 2)
+
+    ALIGN_FLOOR(x,n)            return a pointer that points to memory
+                                that is aligned on an 'n' byte boundary
+                                and is not higher than the memory address
+                                pointed to by 'x' ('n' is a power of 2);
+                                the result has type uint_8t*
+
+    ALIGN_CEIL(x,n)             return a pointer that points to memory
+                                that is aligned on an 'n' byte boundary
+                                and is not lower than the memory address
+                                pointed to by 'x' ('n' is a power of 2);
+                                the result has type uint_8t*
+*/
+
+#define ALIGN_OFFSET(x,n)	(((ptrint_t)(x)) & ((n) - 1))
+#define ALIGN_FLOOR(x,n)	((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1)))
+#define ALIGN_CEIL(x,n)		((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1)))
+
+/*  These defines are used to declare buffers in a way that allows
+    faster operations on longer variables to be used.  In all these
+    defines 'size' must be a power of 2 and >= 8. NOTE that the
+    buffer size is in bytes but the type length is in bits.
+
+    'size' is pasted into the type name (uint_<size>t), so it must be
+    a plain number such as 8, 16, 32 or 64 for which that type exists.
+
+    UNIT_TYPEDEF(x,size)        declares a type 'x' of length
+                                'size' bits
+
+    BUFR_TYPEDEF(x,size,bsize)  declares a buffer type 'x' of length
+                                'bsize' bytes defined as an array of
+                                variables each of 'size' bits (bsize
+                                must be a multiple of size / 8); 'bsize'
+                                may be any constant expression
+
+    UNIT_CAST(x,size)           casts a variable to a type of
+                                length 'size' bits
+
+    UPTR_CAST(x,size)           casts a pointer to a pointer to a
+                                variable of length 'size' bits
+*/
+
+#define UI_TYPE(size)               uint_##size##t
+#define UNIT_TYPEDEF(x,size)        typedef UI_TYPE(size) x
+/* arguments are parenthesised so that e.g. a 'bsize' of N + M divides as a whole */
+#define BUFR_TYPEDEF(x,size,bsize)  typedef UI_TYPE(size) x[(bsize) / ((size) >> 3)]
+#define UNIT_CAST(x,size)           ((UI_TYPE(size) )(x))
+#define UPTR_CAST(x,size)           ((UI_TYPE(size)*)(x))
+
+  /* Added by Soeren S. Thomsen (begin) */
+  /* short aliases for the fixed-width types above; u64 is only usable */
+  /* when a uint_64t type was found (BRG_UI64 defined)                 */
+#define u8 uint_8t
+#define u32 uint_32t
+#define u64 uint_64t
+  /* (end) */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/algo/groestl/sse2/groestl.c
+++ b/algo/groestl/sse2/groestl.c
--- a/algo/groestl/sse2/grsi-asm.h
+++ b/algo/groestl/sse2/grsi-asm.h
@@ -0,0 +1,956 @@
+/* groestl-intr-vperm.h     Aug 2011
+ *
+ * Groestl implementation with intrinsics using ssse3 instructions.
+ * Author: Günther A. Roland, Martin Schläffer
+ *
+ * Based on the vperm and aes_ni implementations of the hash function Groestl
+ * by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
+ * Institute of Applied Mathematics, Middle East Technical University, Turkey
+ *
+ * This code is placed in the public domain
+ */
+
+#include <tmmintrin.h>
+#include "grsi.h"
+
+/* Define data alignment for different C compilers.
+ * DATA_ALIGN16(x) wraps a declarator 'x' so that the object is 16-byte
+ * aligned: GCC-style attribute after the declarator, otherwise the
+ * __declspec(align(16)) form placed before it. */
+#if defined(__GNUC__)
+      #define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
+#else
+      #define DATA_ALIGN16(x) __declspec(align(16)) x
+#endif
+
+/* GLOBAL prefixes the shared variables declared below.  The original
+ * DECLARE_GLOBAL test is commented out and replaced by a constant 1, so
+ * GLOBAL always expands to nothing: the variables are defined (not merely
+ * declared extern) wherever this header is included. */
+//#if defined(DECLARE_GLOBAL)
+#if 1
+#define GLOBAL
+#else
+#define GLOBAL extern
+#endif
+
+/* IFUN is handled the same way and likewise always expands to nothing. */
+//#if defined(DECLARE_IFUN)
+#if 1
+#define IFUN
+#else
+#define IFUN extern
+#endif
+
+/* global constants  */
+//GLOBAL __m128i grsiROUND_CONST_Lx;
+//GLOBAL __m128i grsiROUND_CONST_L0[grsiROUNDS512];
+//GLOBAL __m128i grsiROUND_CONST_L7[grsiROUNDS512];
+/* Byte-shuffle masks for the ShiftBytes step, stored as 8 rows of four
+ * 32-bit words (16 bytes per row, 16-byte aligned).  The rows are read as
+ * 128-bit values through the grsiSUBSH_MASK pointer and used as the
+ * selector argument of _mm_shuffle_epi8.  With little-endian word storage,
+ * row k (k = 0..6) selects source byte (j + k) mod 16 for destination
+ * byte j; row 7 uses an offset of 11 instead of 7. */
+DATA_ALIGN16(int32_t grsiSUBSH_MASK_short[8*4]) = {
+    0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,
+    0x04030201, 0x08070605, 0x0c0b0a09, 0x000f0e0d,
+    0x05040302, 0x09080706, 0x0d0c0b0a, 0x01000f0e,
+    0x06050403, 0x0a090807, 0x0e0d0c0b, 0x0201000f,
+    0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100,
+    0x08070605, 0x0c0b0a09, 0x000f0e0d, 0x04030201,
+    0x09080706, 0x0d0c0b0a, 0x01000f0e, 0x05040302,
+    0x0e0d0c0b, 0x0201000f, 0x06050403, 0x0a090807
+};
+GLOBAL __m128i *grsiSUBSH_MASK = grsiSUBSH_MASK_short;
+GLOBAL __m128i grsiALL_0F = {0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f};
+GLOBAL __m128i grsiALL_1B = {0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b};
+GLOBAL __m128i grsiALL_FF = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
+
+/* vperm lookup tables, two 128-bit halves each.  OPT, INV, SB1, SB2 and
+ * SB4 are filled in by grsiSET_SHARED_CONSTANTS; grsiVPERM_SBO is declared
+ * here but not assigned by that macro. */
+
+
+GLOBAL __m128i grsiVPERM_OPT[2];
+GLOBAL __m128i grsiVPERM_INV[2];
+GLOBAL __m128i grsiVPERM_SB1[2];
+GLOBAL __m128i grsiVPERM_SB2[2];
+GLOBAL __m128i grsiVPERM_SB4[2];
+GLOBAL __m128i grsiVPERM_SBO[2];
+
+/* state vars
+ * grsiTRANSP_MASK, grsiVPERM_IPT and grsiALL_15 are set by
+ * grsiSET_SHARED_CONSTANTS; the two round-constant arrays are filled by
+ * grsiSET_CONSTANTS.  The assignment of grsiALL_63 is commented out in
+ * grsiSET_SHARED_CONSTANTS, so it stays zero unless set elsewhere. */
+GLOBAL __m128i grsiTRANSP_MASK;
+GLOBAL __m128i grsiVPERM_IPT[2];
+GLOBAL __m128i grsiALL_15;
+GLOBAL __m128i grsiALL_63;
+GLOBAL __m128i grsiROUND_CONST_P[grsiROUNDS1024];
+GLOBAL __m128i grsiROUND_CONST_Q[grsiROUNDS1024];
+
+/* grsitos(a) turns its argument into a string literal as written;
+ * grsitostr(a) macro-expands the argument first and then stringifies it. */
+#define grsitos(a)    #a
+#define grsitostr(a)  grsitos(a)
+
+/* grsiSET_SHARED_CONSTANTS
+ * Run-time initialisation of the transpose mask, the 0x0f and 0x15 byte
+ * broadcasts and the vperm tables IPT, OPT, INV, SB1, SB2 and SB4.
+ * _mm_set_epi32 takes its arguments from the highest 32-bit lane down to
+ * the lowest.  Must be expanded before any macro that reads these globals.
+ *
+ * The following two assignments are disabled:
+  grsiALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
+  grsiALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
+*/
+
+#define grsiSET_SHARED_CONSTANTS(){\
+  grsiTRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
+  grsiALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\
+  grsiALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\
+\
+  grsiVPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\
+  grsiVPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\
+  grsiVPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\
+  grsiVPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\
+  grsiVPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\
+  grsiVPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\
+  grsiVPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\
+  grsiVPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\
+  grsiVPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\
+  grsiVPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\
+  grsiVPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\
+  grsiVPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\
+}/**/
+
+/* grsiVPERM
+ * Transform w/o settings c*
+ * transforms 2 rows to/from "vperm mode"
+ * this function is derived from:
+ *   vperm and aes_ni implementations of hash function Grostl
+ *   by Cagdas CALIK
+ * inputs:
+ * a0, a1 = 2 rows
+ * c0 = nibble mask, c1, c2 = the two halves of the transformation table
+ *      (as loaded by grsiVPERM_Transform_Set_Const); only read here
+ * t0-t3 = clobbers
+ * outputs:
+ * a0, a1 = 2 rows transformed with table
+ *
+ * Per row: the bits outside c0 are shifted right by 4 and used as shuffle
+ * indices into c1, the bits inside c0 are used as shuffle indices into c2,
+ * and the two shuffle results are XORed together.
+ * */
+#define grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
+  t0 = c0;\
+  t1 = c0;\
+  t0 = _mm_andnot_si128(t0, a0);\
+  t1 = _mm_andnot_si128(t1, a1);\
+  t0 = _mm_srli_epi32(t0, 4);\
+  t1 = _mm_srli_epi32(t1, 4);\
+  a0 = _mm_and_si128(a0, c0);\
+  a1 = _mm_and_si128(a1, c0);\
+  t2 = c2;\
+  t3 = c2;\
+  t2 = _mm_shuffle_epi8(t2, a0);\
+  t3 = _mm_shuffle_epi8(t3, a1);\
+  a0 = c1;\
+  a1 = c1;\
+  a0 = _mm_shuffle_epi8(a0, t0);\
+  a1 = _mm_shuffle_epi8(a1, t1);\
+  a0 = _mm_xor_si128(a0, t2);\
+  a1 = _mm_xor_si128(a1, t3);\
+}/**/
+
+/* grsiVPERM
+ * Load the constants used by grsiVPERM_Transform_No_Const:
+ * c0 = grsiALL_0F (low-nibble mask), c1 = table[0], c2 = table[1].
+ * 'table' is cast to __m128i* without extra parentheses, so pass a plain
+ * array or pointer name rather than an expression.
+ * */
+#define grsiVPERM_Transform_Set_Const(table, c0, c1, c2){\
+  c0 = grsiALL_0F;\
+  c1 = ((__m128i*) table )[0];\
+  c2 = ((__m128i*) table )[1];\
+}/**/
+
+/* grsiVPERM
+ * Transform
+ * transforms 2 rows to/from "vperm mode"
+ * (loads the constants for 'table', then applies the transform once)
+ * this function is derived from:
+ *   vperm and aes_ni implementations of hash function Grostl
+ *   by Cagdas CALIK
+ * inputs:
+ * a0, a1 = 2 rows
+ * table = transformation table to use
+ * t*, c* = clobbers (c0-c2 hold the mask and the table halves afterwards)
+ * outputs:
+ * a0, a1 = 2 rows transformed with table
+ * */
+#define grsiVPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
+  grsiVPERM_Transform_Set_Const(table, c0, c1, c2);\
+  grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
+}/**/
+
+/* grsiVPERM
+ * Transform State
+ * (loads the constants for 'table' once, then transforms the rows in two
+ *  pairs: a0/a1 and a2/a3)
+ * inputs:
+ * a0-a3 = state
+ * table = transformation table to use
+ * t*, c* = clobbers
+ * outputs:
+ * a0-a3 = transformed state
+ * */
+#define grsiVPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
+  grsiVPERM_Transform_Set_Const(table, c0, c1, c2);\
+  grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
+  grsiVPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
+}/**/
+
+/* grsiVPERM
+ * Add Constant to State
+ * ("add" is a bytewise XOR: the same 128-bit constant is XORed into each
+ *  of the eight rows)
+ * inputs:
+ * a0-a7 = state
+ * constant = constant to add
+ * t0 = clobber (holds a copy of 'constant' afterwards)
+ * outputs:
+ * a0-a7 = state + constant
+ * */
+#define grsiVPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
+  t0 = constant;\
+  a0 = _mm_xor_si128(a0,  t0);\
+  a1 = _mm_xor_si128(a1,  t0);\
+  a2 = _mm_xor_si128(a2,  t0);\
+  a3 = _mm_xor_si128(a3,  t0);\
+  a4 = _mm_xor_si128(a4,  t0);\
+  a5 = _mm_xor_si128(a5,  t0);\
+  a6 = _mm_xor_si128(a6,  t0);\
+  a7 = _mm_xor_si128(a7,  t0);\
+}/**/
+
+/* grsiVPERM
+ * Set Substitute Core Constants
+ * loads c0 = grsiALL_0F, c1 = grsiVPERM_INV[0], c2 = grsiVPERM_INV[1]
+ * for use by grsiVPERM_Substitute_Core
+ * */
+#define grsiVPERM_Substitute_Core_Set_Const(c0, c1, c2){\
+  grsiVPERM_Transform_Set_Const(grsiVPERM_INV, c0, c1, c2);\
+}/**/
+
+/* grsiVPERM
+ * Substitute Core
+ * first part of sbox inverse computation
+ * this function is derived from:
+ *   vperm and aes_ni implementations of hash function Grostl
+ *   by Cagdas CALIK
+ * inputs:
+ * a0 = 1 row (overwritten: holds low-nibbles XOR high-nibbles on exit)
+ * c0 = nibble mask, c1, c2 = the two table halves set up by
+ *      grsiVPERM_Substitute_Core_Set_Const; only read here
+ * t0, t1 = clobbers
+ * outputs:
+ * b0a, b0b = inputs for lookup step
+ *
+ * The row is split into high nibbles (t0) and low nibbles (a0).  c1 is
+ * used for a single shuffle indexed by the low nibbles; all remaining
+ * shuffles use c2.  The statement order matters because b0a and b0b are
+ * reused as temporaries before receiving their final values.
+ * */
+#define grsiVPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
+  t0 = c0;\
+  t0 = _mm_andnot_si128(t0, a0);\
+  t0 = _mm_srli_epi32(t0, 4);\
+  a0 = _mm_and_si128(a0,  c0);\
+  b0a = c1;\
+  b0a = _mm_shuffle_epi8(b0a, a0);\
+  a0 = _mm_xor_si128(a0,  t0);\
+  b0b = c2;\
+  b0b = _mm_shuffle_epi8(b0b, t0);\
+  b0b = _mm_xor_si128(b0b, b0a);\
+  t1 = c2;\
+  t1 = _mm_shuffle_epi8(t1,  a0);\
+  t1 = _mm_xor_si128(t1,  b0a);\
+  b0a = c2;\
+  b0a = _mm_shuffle_epi8(b0a, b0b);\
+  b0a = _mm_xor_si128(b0a, a0);\
+  b0b = c2;\
+  b0b = _mm_shuffle_epi8(b0b, t1);\
+  b0b = _mm_xor_si128(b0b, t0);\
+}/**/
+
+/* grsiVPERM
+ * Lookup
+ * second part of sbox inverse computation
+ * this function is derived from:
+ *   vperm and aes_ni implementations of hash function Grostl
+ *   by Cagdas CALIK
+ * inputs:
+ * a0a, a0b = output of Substitution Core (left unchanged)
+ * table = lookup table to use (*1 / *2 / *4)
+ * t0 = clobber
+ * outputs:
+ * b0 = output of sbox + multiplication
+ *
+ * Computes b0 = shuffle(table[0], a0b) XOR shuffle(table[1], a0a).
+ * 'table' is cast to __m128i* without extra parentheses, so pass a plain
+ * array or pointer name rather than an expression.
+ * */
+#define grsiVPERM_Lookup(a0a, a0b, table, b0, t0){\
+  b0 = ((__m128i*) table )[0];\
+  t0 = ((__m128i*) table )[1];\
+  b0 = _mm_shuffle_epi8(b0, a0b);\
+  t0 = _mm_shuffle_epi8(t0, a0a);\
+  b0 = _mm_xor_si128(b0, t0);\
+}/**/
+
+/* grsiVPERM
+ * SubBytes and *2 / *4
+ * this function is derived from:
+ *   Constant-time SSSE3 AES core implementation
+ *   by Mike Hamburg
+ * and
+ *   vperm and aes_ni implementations of hash function Grostl
+ *   by Cagdas CALIK
+ * inputs:
+ * a0-a7 = state
+ * t*, c* = clobbers
+ * outputs:
+ * a0-a7 = state * 4
+ * c2 = row0 * 2 -> b0
+ * c1 = row7 * 2 -> b3
+ * c0 = row0 * 1 -> b4
+ * t2 = row4 * 1 -> b7
+ * TEMP_MUL1 = row(i) * 1, stored for i = 1, 2, 3, 5, 6, 7
+ * TEMP_MUL2 = row(i) * 2, stored for i = 0, 1, 2, 3, 4, 5, 6
+ *
+ * TEMP_MUL1 and TEMP_MUL2 are not parameters; they must be visible where
+ * the macro is expanded.
+ *
+ * Rows are processed in the order 1, 2, 3, 5, 6, 7, 4, 0.  The row 7
+ * lookup overwrites c1, so rows 4 and 0 pass grsiVPERM_INV[0] directly
+ * instead of c1.  The row 0 lookups overwrite c0 and c2, which is why
+ * row 0 has to come last.
+ *
+ * call:grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
+#define grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
+  /* set Constants */\
+  grsiVPERM_Substitute_Core_Set_Const(c0, c1, c2);\
+  /* row 1 */\
+  grsiVPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
+  TEMP_MUL1[1] = t2;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
+  TEMP_MUL2[1] = t3;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a1, t4);\
+  /* --- */\
+  /* row 2 */\
+  grsiVPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
+  TEMP_MUL1[2] = t2;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
+  TEMP_MUL2[2] = t3;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a2, t4);\
+  /* --- */\
+  /* row 3 */\
+  grsiVPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
+  TEMP_MUL1[3] = t2;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
+  TEMP_MUL2[3] = t3;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a3, t4);\
+  /* --- */\
+  /* row 5 */\
+  grsiVPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
+  TEMP_MUL1[5] = t2;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
+  TEMP_MUL2[5] = t3;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a5, t4);\
+  /* --- */\
+  /* row 6 */\
+  grsiVPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
+  TEMP_MUL1[6] = t2;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
+  TEMP_MUL2[6] = t3;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a6, t4);\
+  /* --- */\
+  /* row 7 */\
+  grsiVPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
+  TEMP_MUL1[7] = t2;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, c1, t4); /*c1 -> b3*/\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a7, t4);\
+  /* --- */\
+  /* row 4 */\
+  grsiVPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (grsiVPERM_INV[0]), c2);\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4); /*t2 -> b7*/\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
+  TEMP_MUL2[4] = t3;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a4, t4);\
+  /* --- */\
+  /* row 0 */\
+  grsiVPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (grsiVPERM_INV[0]), c2);\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, c0, t4); /*c0 -> b4*/\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, c2, t4); /*c2 -> b0*/\
+  TEMP_MUL2[0] = c2;\
+  grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a0, t4);\
+  /* --- */\
+}/**/
+
+
+/* Optimized grsiMixBytes
+ * inputs:
+ * a0-a7 = (row0-row7) * 4
+ * b0 = row0 * 2
+ * b3 = row7 * 2
+ * b4 = row0 * 1
+ * b7 = row4 * 1
+ * (the "helper" comments below mark where one of these register inputs
+ *  stands in for a TEMP_MUL* load)
+ * the remaining *1 and *2 values are read from memory:
+ *   TEMP_MUL1[i] for i = 1, 2, 3, 5, 6, 7
+ *   TEMP_MUL2[i] for i = 0, 1, 2, 3, 4, 5, 6
+ * output: b0-b7
+ * clobbers: a1, a3, a5 (reused as temporaries) and TEMP_MUL4 (holds the
+ *   original a3 until step 13); a0, a2, a4, a6, a7 are only read.
+ * TEMP_MUL1, TEMP_MUL2 and TEMP_MUL4 are not parameters; they must be
+ * visible where the macro is expanded.
+ * */
+#define grsiMixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
+  /* save one value */\
+  TEMP_MUL4 = a3;\
+  /* 1 */\
+  b1 = a0;\
+  b1 = _mm_xor_si128(b1, a5);\
+  b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
+  b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
+  b2 = b1;\
+  \
+  /* 2 */\
+  b5 = a1;\
+  b5 = _mm_xor_si128(b5, a4);\
+  b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
+  b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
+  b6 = b5;\
+  \
+  /* 4 */\
+  b7 = _mm_xor_si128(b7, a6);\
+  /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
+  b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
+  b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
+  b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
+  b2 = _mm_xor_si128(b2, b7);\
+  \
+  /* 3 */\
+  b0 = _mm_xor_si128(b0, a7);\
+  b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
+  b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
+  /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
+  b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
+  b3 = b0;\
+  b1 = _mm_xor_si128(b1, b0);\
+  b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
+  \
+  /* 5 */\
+  b4 = _mm_xor_si128(b4, a2);\
+  /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
+  b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
+  b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
+  b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
+  b3 = _mm_xor_si128(b3, b4);\
+  b6 = _mm_xor_si128(b6, b4);\
+  \
+  /* 6 */\
+  a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
+  a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
+  a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
+  a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
+  b4 = _mm_xor_si128(b4, a3);\
+  b5 = _mm_xor_si128(b5, a3);\
+  b7 = _mm_xor_si128(b7, a3);\
+  \
+  /* 7 */\
+  a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
+  a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
+  b2 = _mm_xor_si128(b2, a1);\
+  b3 = _mm_xor_si128(b3, a1);\
+  \
+  /* 8 */\
+  a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
+  a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
+  b6 = _mm_xor_si128(b6, a5);\
+  b7 = _mm_xor_si128(b7, a5);\
+  \
+  /* 9 */\
+  a3 = TEMP_MUL1[2];\
+  a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
+  b0 = _mm_xor_si128(b0, a3);\
+  b5 = _mm_xor_si128(b5, a3);\
+  \
+  /* 10 */\
+  a1 = TEMP_MUL1[6];\
+  a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
+  b1 = _mm_xor_si128(b1, a1);\
+  b4 = _mm_xor_si128(b4, a1);\
+  \
+  /* 11 */\
+  a5 = TEMP_MUL1[3];\
+  a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
+  b1 = _mm_xor_si128(b1, a5);\
+  b6 = _mm_xor_si128(b6, a5);\
+  \
+  /* 12 */\
+  a3 = TEMP_MUL1[7];\
+  a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
+  b2 = _mm_xor_si128(b2, a3);\
+  b5 = _mm_xor_si128(b5, a3);\
+  \
+  /* 13 */\
+  b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
+  b0 = _mm_xor_si128(b0, a4);\
+  b1 = _mm_xor_si128(b1, a4);\
+  b3 = _mm_xor_si128(b3, a6);\
+  b4 = _mm_xor_si128(b4, a0);\
+  b4 = _mm_xor_si128(b4, a7);\
+  b5 = _mm_xor_si128(b5, a0);\
+  b7 = _mm_xor_si128(b7, a2);\
+}/**/
+
+/* grsiSET_CONSTANTS
+ * Initialises the shared constants (via grsiSET_SHARED_CONSTANTS),
+ * grsiALL_FF, and the per-round constants grsiROUND_CONST_P[i] and
+ * grsiROUND_CONST_Q[i] for i = 0 .. grsiROUNDS1024-1.  Each round constant
+ * is a fixed 128-bit pattern with the round index XORed into every byte
+ * (i * 0x01010101 replicates i into the four bytes of a word).
+ *
+ * The loop variable 'i' is NOT declared by the macro: the expanding code
+ * must have an integer variable named i in scope, and it is overwritten.
+ *
+ * The shuffle masks below are no longer set at run time; the same values
+ * are provided statically by grsiSUBSH_MASK_short.
+  grsiSUBSH_MASK[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);\
+  grsiSUBSH_MASK[1] = _mm_set_epi32(0x000f0e0d, 0x0c0b0a09, 0x08070605, 0x04030201);\
+  grsiSUBSH_MASK[2] = _mm_set_epi32(0x01000f0e, 0x0d0c0b0a, 0x09080706, 0x05040302);\
+  grsiSUBSH_MASK[3] = _mm_set_epi32(0x0201000f, 0x0e0d0c0b, 0x0a090807, 0x06050403);\
+  grsiSUBSH_MASK[4] = _mm_set_epi32(0x03020100, 0x0f0e0d0c, 0x0b0a0908, 0x07060504);\
+  grsiSUBSH_MASK[5] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605);\
+  grsiSUBSH_MASK[6] = _mm_set_epi32(0x05040302, 0x01000f0e, 0x0d0c0b0a, 0x09080706);\
+  grsiSUBSH_MASK[7] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b);\
+*/
+
+#define grsiSET_CONSTANTS(){\
+  grsiSET_SHARED_CONSTANTS();\
+  grsiALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
+  for(i = 0; i < grsiROUNDS1024; i++)\
+  {\
+    grsiROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
+    grsiROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
+  }\
+}/**/
+
+/* one round
+ * (SubBytes with the *1/*2/*4 multiples, followed by MixBytes)
+ * a0-a7 = input rows (overwritten)
+ * b0-b7 = output rows
+ * The b registers double as the scratch and helper registers of
+ * grsiVPERM_SUB_MULTIPLY: b1, b2, b5, b6 are its temporaries, while
+ * b0, b3, b4, b7 carry its register outputs into grsiMixBytes.
+ */
+#define grsiSUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
+  /* SubBytes + Multiplication */\
+  grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
+  /* grsiMixBytes */\
+  grsiMixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
+}/**/
+
+/* grsiROUNDS_P
+ * Applies 14 rounds of the P permutation, two per loop iteration.
+ * The state is expected in the variables xmm8-xmm15 of the expanding code
+ * and is returned in xmm8-xmm15; xmm0-xmm7 hold the intermediate state
+ * between the two half-iterations and are clobbered.
+ * Each round: XOR the round constant into the first row only, shuffle
+ * every row with its grsiSUBSH_MASK entry, run grsiSUBMIX, then XOR
+ * grsiALL_15 into all rows of the result.
+ * The loop bound is the literal 14, not grsiROUNDS1024.
+ */
+#define grsiROUNDS_P(){\
+  u32 round_counter;\
+  for(round_counter = 0; round_counter < 14; round_counter+=2) {\
+    /* AddRoundConstant P1024 */\
+    xmm8 = _mm_xor_si128(xmm8, (grsiROUND_CONST_P[round_counter]));\
+    /* ShiftBytes P1024 + pre-AESENCLAST */\
+    xmm8 = _mm_shuffle_epi8(xmm8,  (grsiSUBSH_MASK[0]));\
+    xmm9 = _mm_shuffle_epi8(xmm9,  (grsiSUBSH_MASK[1]));\
+    xmm10 = _mm_shuffle_epi8(xmm10, (grsiSUBSH_MASK[2]));\
+    xmm11 = _mm_shuffle_epi8(xmm11, (grsiSUBSH_MASK[3]));\
+    xmm12 = _mm_shuffle_epi8(xmm12, (grsiSUBSH_MASK[4]));\
+    xmm13 = _mm_shuffle_epi8(xmm13, (grsiSUBSH_MASK[5]));\
+    xmm14 = _mm_shuffle_epi8(xmm14, (grsiSUBSH_MASK[6]));\
+    xmm15 = _mm_shuffle_epi8(xmm15, (grsiSUBSH_MASK[7]));\
+    /* SubBytes + grsiMixBytes */\
+    grsiSUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
+    grsiVPERM_Add_Constant(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, grsiALL_15, xmm8);\
+    \
+    /* AddRoundConstant P1024 */\
+    xmm0 = _mm_xor_si128(xmm0, (grsiROUND_CONST_P[round_counter+1]));\
+    /* ShiftBytes P1024 + pre-AESENCLAST */\
+    xmm0 = _mm_shuffle_epi8(xmm0, (grsiSUBSH_MASK[0]));\
+    xmm1 = _mm_shuffle_epi8(xmm1, (grsiSUBSH_MASK[1]));\
+    xmm2 = _mm_shuffle_epi8(xmm2, (grsiSUBSH_MASK[2]));\
+    xmm3 = _mm_shuffle_epi8(xmm3, (grsiSUBSH_MASK[3]));\
+    xmm4 = _mm_shuffle_epi8(xmm4, (grsiSUBSH_MASK[4]));\
+    xmm5 = _mm_shuffle_epi8(xmm5, (grsiSUBSH_MASK[5]));\
+    xmm6 = _mm_shuffle_epi8(xmm6, (grsiSUBSH_MASK[6]));\
+    xmm7 = _mm_shuffle_epi8(xmm7, (grsiSUBSH_MASK[7]));\
+    /* SubBytes + grsiMixBytes */\
+    grsiSUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
+    grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm0);\
+  }\
+}/**/
+
+/* Permutation Q on the 1024-bit state: 14 rounds, run as 7 loop iterations
+ * of two rounds each.
+ * input/output: xmm8-xmm15; xmm0-xmm7 carry the state between the two rounds
+ * of an iteration. All sixteen xmm variables must exist in the enclosing
+ * scope; the macro declares round_counter itself.
+ * Per round, seven rows are XORed with grsiALL_FF and the last row (xmm15,
+ * then xmm7) with the round constant. The shuffle masks are applied in the
+ * order 1, 3, 5, 7, 0, 2, 4, 6.
+ * grsiVPERM_Add_Constant is applied once before and once after the loop
+ * rather than after every round.
+ * xmm1 and xmm9 temporarily hold grsiALL_FF and are overwritten by the
+ * grsiSUBMIX that follows.
+ */
+#define grsiROUNDS_Q(){\
+  grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm1);\
+  u32 round_counter = 0;\
+  for(round_counter = 0; round_counter < 14; round_counter+=2) {\
+    /* AddRoundConstant Q1024 */\
+    xmm1 = grsiALL_FF;\
+    xmm8 = _mm_xor_si128(xmm8, xmm1);\
+    xmm9 = _mm_xor_si128(xmm9, xmm1);\
+    xmm10 = _mm_xor_si128(xmm10, xmm1);\
+    xmm11 = _mm_xor_si128(xmm11, xmm1);\
+    xmm12 = _mm_xor_si128(xmm12, xmm1);\
+    xmm13 = _mm_xor_si128(xmm13, xmm1);\
+    xmm14 = _mm_xor_si128(xmm14, xmm1);\
+    xmm15 = _mm_xor_si128(xmm15, (grsiROUND_CONST_Q[round_counter]));\
+    /* ShiftBytes Q1024 + pre-AESENCLAST */\
+    xmm8 = _mm_shuffle_epi8(xmm8, (grsiSUBSH_MASK[1]));\
+    xmm9 = _mm_shuffle_epi8(xmm9, (grsiSUBSH_MASK[3]));\
+    xmm10 = _mm_shuffle_epi8(xmm10, (grsiSUBSH_MASK[5]));\
+    xmm11 = _mm_shuffle_epi8(xmm11, (grsiSUBSH_MASK[7]));\
+    xmm12 = _mm_shuffle_epi8(xmm12, (grsiSUBSH_MASK[0]));\
+    xmm13 = _mm_shuffle_epi8(xmm13, (grsiSUBSH_MASK[2]));\
+    xmm14 = _mm_shuffle_epi8(xmm14, (grsiSUBSH_MASK[4]));\
+    xmm15 = _mm_shuffle_epi8(xmm15, (grsiSUBSH_MASK[6]));\
+    /* SubBytes + grsiMixBytes */\
+    grsiSUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
+    \
+    /* AddRoundConstant Q1024 */\
+    xmm9 = grsiALL_FF;\
+    xmm0 = _mm_xor_si128(xmm0, xmm9);\
+    xmm1 = _mm_xor_si128(xmm1, xmm9);\
+    xmm2 = _mm_xor_si128(xmm2, xmm9);\
+    xmm3 = _mm_xor_si128(xmm3, xmm9);\
+    xmm4 = _mm_xor_si128(xmm4, xmm9);\
+    xmm5 = _mm_xor_si128(xmm5, xmm9);\
+    xmm6 = _mm_xor_si128(xmm6, xmm9);\
+    xmm7 = _mm_xor_si128(xmm7, (grsiROUND_CONST_Q[round_counter+1]));\
+    /* ShiftBytes Q1024 + pre-AESENCLAST */\
+    xmm0 = _mm_shuffle_epi8(xmm0, (grsiSUBSH_MASK[1]));\
+    xmm1 = _mm_shuffle_epi8(xmm1, (grsiSUBSH_MASK[3]));\
+    xmm2 = _mm_shuffle_epi8(xmm2, (grsiSUBSH_MASK[5]));\
+    xmm3 = _mm_shuffle_epi8(xmm3, (grsiSUBSH_MASK[7]));\
+    xmm4 = _mm_shuffle_epi8(xmm4, (grsiSUBSH_MASK[0]));\
+    xmm5 = _mm_shuffle_epi8(xmm5, (grsiSUBSH_MASK[2]));\
+    xmm6 = _mm_shuffle_epi8(xmm6, (grsiSUBSH_MASK[4]));\
+    xmm7 = _mm_shuffle_epi8(xmm7, (grsiSUBSH_MASK[6]));\
+    /* SubBytes + grsiMixBytes*/ \
+    grsiSUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
+  }\
+  grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm1);\
+}/**/
+
+
+/* Matrix Transpose
+ * input is a 1024-bit state with two columns in one xmm
+ * output is a 1024-bit state with one row in each xmm
+ * inputs: i0-i7
+ * outputs: i0-i7
+ * clobbers: t0-t7
+ * Steps: byte shuffle of every input with grsiTRANSP_MASK, 16-bit unpack,
+ * dword reorder, 32-bit unpack, 64-bit unpack.
+ * The statement order matters: several temporaries are copied from an input
+ * just before that input is overwritten.
+ */
+#define grsiMatrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
+  t0 = grsiTRANSP_MASK;\
+\
+  i6 = _mm_shuffle_epi8(i6, t0);\
+  i0 = _mm_shuffle_epi8(i0, t0);\
+  i1 = _mm_shuffle_epi8(i1, t0);\
+  i2 = _mm_shuffle_epi8(i2, t0);\
+  i3 = _mm_shuffle_epi8(i3, t0);\
+  t1 = i2;\
+  i4 = _mm_shuffle_epi8(i4, t0);\
+  i5 = _mm_shuffle_epi8(i5, t0);\
+  t2 = i4;\
+  t3 = i6;\
+  i7 = _mm_shuffle_epi8(i7, t0);\
+\
+  /* continue with unpack using 4 temp registers */\
+  t0 = i0;\
+  t2 = _mm_unpackhi_epi16(t2, i5);\
+  i4 = _mm_unpacklo_epi16(i4, i5);\
+  t3 = _mm_unpackhi_epi16(t3, i7);\
+  i6 = _mm_unpacklo_epi16(i6, i7);\
+  t0 = _mm_unpackhi_epi16(t0, i1);\
+  t1 = _mm_unpackhi_epi16(t1, i3);\
+  i2 = _mm_unpacklo_epi16(i2, i3);\
+  i0 = _mm_unpacklo_epi16(i0, i1);\
+\
+  /* shuffle with immediate: 216 = 0xD8 selects dwords 0, 2, 1, 3 */\
+  t0 = _mm_shuffle_epi32(t0, 216);\
+  t1 = _mm_shuffle_epi32(t1, 216);\
+  t2 = _mm_shuffle_epi32(t2, 216);\
+  t3 = _mm_shuffle_epi32(t3, 216);\
+  i0 = _mm_shuffle_epi32(i0, 216);\
+  i2 = _mm_shuffle_epi32(i2, 216);\
+  i4 = _mm_shuffle_epi32(i4, 216);\
+  i6 = _mm_shuffle_epi32(i6, 216);\
+\
+  /* continue with unpack */\
+  t4 = i0;\
+  i0 = _mm_unpacklo_epi32(i0,  i2);\
+  t4 = _mm_unpackhi_epi32(t4,  i2);\
+  t5 = t0;\
+  t0 = _mm_unpacklo_epi32(t0,  t1);\
+  t5 = _mm_unpackhi_epi32(t5,  t1);\
+  t6 = i4;\
+  i4 = _mm_unpacklo_epi32(i4, i6);\
+  t7 = t2;\
+  t6 = _mm_unpackhi_epi32(t6,  i6);\
+  i2 = t0;\
+  t2 = _mm_unpacklo_epi32(t2,  t3);\
+  i3 = t0;\
+  t7 = _mm_unpackhi_epi32(t7,  t3);\
+\
+  /* there are now 2 rows in each xmm */\
+  /* unpack to get 1 row of CV in each xmm */\
+  i1 = i0;\
+  i1 = _mm_unpackhi_epi64(i1, i4);\
+  i0 = _mm_unpacklo_epi64(i0, i4);\
+  i4 = t4;\
+  i3 = _mm_unpackhi_epi64(i3, t2);\
+  i5 = t4;\
+  i2 = _mm_unpacklo_epi64(i2, t2);\
+  i6 = t5;\
+  i5 = _mm_unpackhi_epi64(i5, t6);\
+  i7 = t5;\
+  i4 = _mm_unpacklo_epi64(i4, t6);\
+  i7 = _mm_unpackhi_epi64(i7, t7);\
+  i6 = _mm_unpacklo_epi64(i6, t7);\
+  /* transpose done */\
+}/**/
+
+/* Matrix Transpose Inverse
+ * input is a 1024-bit state with one row in each xmm
+ * output is a 1024-bit state with two columns in one xmm
+ * inputs: i0-i7
+ * outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
+ * clobbers: t0-t4, and also i2, i4, i6 (overwritten, not part of the output)
+ * o0 doubles as the holder of grsiTRANSP_MASK until it receives its result.
+ */
+#define grsiMatrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
+  /*  transpose matrix to get output format */\
+  o1 = i0;\
+  i0 = _mm_unpacklo_epi64(i0, i1);\
+  o1 = _mm_unpackhi_epi64(o1, i1);\
+  t0 = i2;\
+  i2 = _mm_unpacklo_epi64(i2, i3);\
+  t0 = _mm_unpackhi_epi64(t0, i3);\
+  t1 = i4;\
+  i4 = _mm_unpacklo_epi64(i4, i5);\
+  t1 = _mm_unpackhi_epi64(t1, i5);\
+  t2 = i6;\
+  o0 = grsiTRANSP_MASK;\
+  i6 = _mm_unpacklo_epi64(i6, i7);\
+  t2 = _mm_unpackhi_epi64(t2, i7);\
+  /* the transpose mask was loaded into o0 above, because it is used 8 times */\
+  i0 = _mm_shuffle_epi8(i0, o0);\
+  i2 = _mm_shuffle_epi8(i2, o0);\
+  i4 = _mm_shuffle_epi8(i4, o0);\
+  i6 = _mm_shuffle_epi8(i6, o0);\
+  o1 = _mm_shuffle_epi8(o1, o0);\
+  t0 = _mm_shuffle_epi8(t0, o0);\
+  t1 = _mm_shuffle_epi8(t1, o0);\
+  t2 = _mm_shuffle_epi8(t2, o0);\
+  /* continue with unpack using 4 temp registers */\
+  t3 = i4;\
+  o2 = o1;\
+  o0 = i0;\
+  t4 = t1;\
+  \
+  t3 = _mm_unpackhi_epi16(t3, i6);\
+  i4 = _mm_unpacklo_epi16(i4, i6);\
+  o0 = _mm_unpackhi_epi16(o0, i2);\
+  i0 = _mm_unpacklo_epi16(i0, i2);\
+  o2 = _mm_unpackhi_epi16(o2, t0);\
+  o1 = _mm_unpacklo_epi16(o1, t0);\
+  t4 = _mm_unpackhi_epi16(t4, t2);\
+  t1 = _mm_unpacklo_epi16(t1, t2);\
+  /* shuffle with immediate: 216 = 0xD8 selects dwords 0, 2, 1, 3 */\
+  i4 = _mm_shuffle_epi32(i4, 216);\
+  t3 = _mm_shuffle_epi32(t3, 216);\
+  o1 = _mm_shuffle_epi32(o1, 216);\
+  o2 = _mm_shuffle_epi32(o2, 216);\
+  i0 = _mm_shuffle_epi32(i0, 216);\
+  o0 = _mm_shuffle_epi32(o0, 216);\
+  t1 = _mm_shuffle_epi32(t1, 216);\
+  t4 = _mm_shuffle_epi32(t4, 216);\
+  /* continue with unpack */\
+  i1 = i0;\
+  i3 = o0;\
+  i5 = o1;\
+  i7 = o2;\
+  i0 = _mm_unpacklo_epi32(i0, i4);\
+  i1 = _mm_unpackhi_epi32(i1, i4);\
+  o0 = _mm_unpacklo_epi32(o0, t3);\
+  i3 = _mm_unpackhi_epi32(i3, t3);\
+  o1 = _mm_unpacklo_epi32(o1, t1);\
+  i5 = _mm_unpackhi_epi32(i5, t1);\
+  o2 = _mm_unpacklo_epi32(o2, t4);\
+  i7 = _mm_unpackhi_epi32(i7, t4);\
+  /* transpose done */\
+}/**/
+
+/* transform round constants into grsiVPERM mode
+ * Handles the P and Q constants of rounds i and j in one go: they are passed
+ * through grsiVPERM_Transform_State with grsiVPERM_IPT, the two Q constants
+ * are additionally XORed with grsiALL_15, and all four are stored back over
+ * the originals.
+ * Uses xmm0-xmm10 from the enclosing scope.
+ * The tables are rewritten in place, so each expansion transforms whatever
+ * they currently hold.
+ */
+#define grsiVPERM_Transform_RoundConst_CNT2(i, j){\
+  xmm0 = grsiROUND_CONST_P[i];\
+  xmm1 = grsiROUND_CONST_P[j];\
+  xmm2 = grsiROUND_CONST_Q[i];\
+  xmm3 = grsiROUND_CONST_Q[j];\
+  grsiVPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, grsiVPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
+  xmm2 = _mm_xor_si128(xmm2, (grsiALL_15));\
+  xmm3 = _mm_xor_si128(xmm3, (grsiALL_15));\
+  grsiROUND_CONST_P[i] = xmm0;\
+  grsiROUND_CONST_P[j] = xmm1;\
+  grsiROUND_CONST_Q[i] = xmm2;\
+  grsiROUND_CONST_Q[j] = xmm3;\
+}/**/
+
+/* transform round constants into grsiVPERM mode
+ * Converts all 14 P and Q round constants (indices 0-13, two per step) and
+ * then grsiALL_FF, which is transformed with grsiVPERM_IPT, XORed with
+ * grsiALL_15 and written back.
+ * Uses xmm0-xmm10 from the enclosing scope.
+ * Everything is rewritten in place, so the constants must hold their
+ * untransformed values when this runs.
+ */
+#define grsiVPERM_Transform_RoundConst(){\
+  grsiVPERM_Transform_RoundConst_CNT2(0, 1);\
+  grsiVPERM_Transform_RoundConst_CNT2(2, 3);\
+  grsiVPERM_Transform_RoundConst_CNT2(4, 5);\
+  grsiVPERM_Transform_RoundConst_CNT2(6, 7);\
+  grsiVPERM_Transform_RoundConst_CNT2(8, 9);\
+  grsiVPERM_Transform_RoundConst_CNT2(10, 11);\
+  grsiVPERM_Transform_RoundConst_CNT2(12, 13);\
+  xmm0 = grsiALL_FF;\
+  grsiVPERM_Transform(xmm0, xmm1, grsiVPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
+  xmm0 = _mm_xor_si128(xmm0, (grsiALL_15));\
+  grsiALL_FF = xmm0;\
+}/**/
+
+
+/* Prepare the constants and the initial chaining value.
+ * 1. rewrites the round-constant tables and grsiALL_FF in place via
+ *    grsiVPERM_Transform_RoundConst();
+ * 2. reads the chaining value h (8 x 128 bits), transforms it with
+ *    grsiVPERM_IPT, transposes it and stores it back into h.
+ * h is accessed through an __m128i pointer, so it is assumed to be 16-byte
+ * aligned -- TODO confirm for all callers.
+ * Only a prototype is emitted unless DECLARE_IFUN is defined.
+ * NOTE(review): all locals have static storage duration and the constants
+ * touched in step 1 are shared objects, so concurrent calls would operate on
+ * the same storage -- confirm this is never run from several threads at once.
+ */
+IFUN void grsiINIT(u64* h)
+#if !defined(DECLARE_IFUN)
+;
+#else
+{
+   __m128i* const chaining = (__m128i*) h;
+  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+
+  /* transform round constants into grsiVPERM mode */
+  grsiVPERM_Transform_RoundConst();
+
+  /* load IV into registers xmm8 - xmm15 */
+  xmm8 = chaining[0];
+  xmm9 = chaining[1];
+  xmm10 = chaining[2];
+  xmm11 = chaining[3];
+  xmm12 = chaining[4];
+  xmm13 = chaining[5];
+  xmm14 = chaining[6];
+  xmm15 = chaining[7];
+
+  /* transform chaining value from column ordering into row ordering */
+  grsiVPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
+  grsiVPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
+  grsiMatrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
+
+  /* store transposed IV */
+  chaining[0] = xmm8;
+  chaining[1] = xmm9;
+  chaining[2] = xmm10;
+  chaining[3] = xmm11;
+  chaining[4] = xmm12;
+  chaining[5] = xmm13;
+  chaining[6] = xmm14;
+  chaining[7] = xmm15;
+}
+#endif
+
+/* Compression function for one 1024-bit message block:
+ *   h <- P(h+m) + Q(m) + h
+ * h: chaining value, 8 x 128 bits, read and overwritten in place. It is used
+ *    exactly as stored (no transform or transpose is applied to it here).
+ * m: message block, 8 x 128 bits, only read; it is transformed with
+ *    grsiVPERM_IPT and transposed before use.
+ * Both pointers are accessed through __m128i pointers, so they are assumed
+ * to be 16-byte aligned -- TODO confirm for all callers.
+ * Only a prototype is emitted unless DECLARE_IFUN is defined.
+ * NOTE(review): all locals, including QTEMP and the TEMP_MUL scratch, have
+ * static storage duration, so concurrent calls would share them -- confirm
+ * this is never run from several threads at once.
+ */
+IFUN void grsiTF1024(u64* h, u64* m)
+#if !defined(DECLARE_IFUN)
+;
+#else
+{
+  __m128i* const chaining = (__m128i*) h;
+  __m128i* const message = (__m128i*) m;
+  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  /* not referenced directly below; presumably scratch used by the round
+     macros (e.g. grsiMixBytes) -- confirm before removing */
+  static __m128i TEMP_MUL1[8];
+  static __m128i TEMP_MUL2[8];
+  static __m128i TEMP_MUL4;
+  static __m128i QTEMP[8];
+
+  /* load message into registers xmm8 - xmm15 (Q = message) */
+  xmm8 = message[0];
+  xmm9 = message[1];
+  xmm10 = message[2];
+  xmm11 = message[3];
+  xmm12 = message[4];
+  xmm13 = message[5];
+  xmm14 = message[6];
+  xmm15 = message[7];
+
+  /* transform message M from column ordering into row ordering */
+  grsiVPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
+  grsiVPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
+  grsiMatrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
+
+  /* store message M (Q input) for later */
+  QTEMP[0] = xmm8;
+  QTEMP[1] = xmm9;
+  QTEMP[2] = xmm10;
+  QTEMP[3] = xmm11;
+  QTEMP[4] = xmm12;
+  QTEMP[5] = xmm13;
+  QTEMP[6] = xmm14;
+  QTEMP[7] = xmm15;
+
+  /* xor CV to message to get P input */
+  /* result: CV+M in xmm8...xmm15 */
+  xmm8 = _mm_xor_si128(xmm8,  (chaining[0]));
+  xmm9 = _mm_xor_si128(xmm9,  (chaining[1]));
+  xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
+  xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
+  xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
+  xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
+  xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
+  xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
+
+  /* compute permutation P */
+  /* result: P(CV+M) in xmm8...xmm15 */
+  grsiROUNDS_P();
+
+  /* xor CV to P output (feed-forward) */
+  /* result: P(CV+M)+CV in xmm8...xmm15 */
+  xmm8 = _mm_xor_si128(xmm8,  (chaining[0]));
+  xmm9 = _mm_xor_si128(xmm9,  (chaining[1]));
+  xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
+  xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
+  xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
+  xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
+  xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
+  xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
+
+  /* store P(CV+M)+CV */
+  chaining[0] = xmm8;
+  chaining[1] = xmm9;
+  chaining[2] = xmm10;
+  chaining[3] = xmm11;
+  chaining[4] = xmm12;
+  chaining[5] = xmm13;
+  chaining[6] = xmm14;
+  chaining[7] = xmm15;
+
+  /* load message M (Q input) into xmm8-15 */
+  xmm8 = QTEMP[0];
+  xmm9 = QTEMP[1];
+  xmm10 = QTEMP[2];
+  xmm11 = QTEMP[3];
+  xmm12 = QTEMP[4];
+  xmm13 = QTEMP[5];
+  xmm14 = QTEMP[6];
+  xmm15 = QTEMP[7];
+
+  /* compute permutation Q */
+  /* result: Q(M) in xmm8...xmm15 */
+  grsiROUNDS_Q();
+
+  /* xor Q output */
+  /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
+  xmm8 = _mm_xor_si128(xmm8,  (chaining[0]));
+  xmm9 = _mm_xor_si128(xmm9,  (chaining[1]));
+  xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
+  xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
+  xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
+  xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
+  xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
+  xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
+
+  /* store CV */
+  chaining[0] = xmm8;
+  chaining[1] = xmm9;
+  chaining[2] = xmm10;
+  chaining[3] = xmm11;
+  chaining[4] = xmm12;
+  chaining[5] = xmm13;
+  chaining[6] = xmm14;
+  chaining[7] = xmm15;
+
+  return;
+}
+#endif
+
+/* Output transformation: computes P(h)+h and writes the truncated half back.
+ * h: chaining value, 8 x 128 bits. Only h[4..7] (in 128-bit units) is
+ *    overwritten, with the result transposed back to column ordering and
+ *    transformed with grsiVPERM_OPT; h[0..3] is left as it was.
+ * h is accessed through an __m128i pointer, so it is assumed to be 16-byte
+ * aligned -- TODO confirm for all callers.
+ * Only a prototype is emitted unless DECLARE_IFUN is defined.
+ * NOTE(review): all locals have static storage duration, so concurrent calls
+ * would share them -- confirm this is never run from several threads at once.
+ */
+IFUN void grsiOF1024(u64* h)
+#if !defined(DECLARE_IFUN)
+;
+#else
+{
+  __m128i* const chaining = (__m128i*) h;
+  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  /* not referenced directly below; presumably scratch used by the round
+     macros (e.g. grsiMixBytes) -- confirm before removing */
+  static __m128i TEMP_MUL1[8];
+  static __m128i TEMP_MUL2[8];
+  static __m128i TEMP_MUL4;
+
+  /* load CV into registers xmm8 - xmm15 */
+  xmm8 = chaining[0];
+  xmm9 = chaining[1];
+  xmm10 = chaining[2];
+  xmm11 = chaining[3];
+  xmm12 = chaining[4];
+  xmm13 = chaining[5];
+  xmm14 = chaining[6];
+  xmm15 = chaining[7];
+
+  /* compute permutation P */
+  /* result: P(CV) in xmm8...xmm15 */
+  grsiROUNDS_P();
+
+  /* xor CV to P output (feed-forward) */
+  /* result: P(CV)+CV in xmm8...xmm15 */
+  xmm8 = _mm_xor_si128(xmm8,  (chaining[0]));
+  xmm9 = _mm_xor_si128(xmm9,  (chaining[1]));
+  xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
+  xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
+  xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
+  xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
+  xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
+  xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
+
+  /* transpose CV back from row ordering to column ordering */
+  /* the macro's outputs land in (xmm8, xmm4, xmm9, xmm11, xmm0, xmm6, xmm13, xmm15) */
+  /* result: final hash value (last four of those) in xmm0, xmm6, xmm13, xmm15 */
+  grsiMatrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
+  grsiVPERM_Transform_State(xmm0, xmm6, xmm13, xmm15, grsiVPERM_OPT, xmm1, xmm2, xmm3, xmm5, xmm7, xmm10, xmm12);
+
+  /* we only need to return the truncated half of the state */
+  chaining[4] = xmm0;
+  chaining[5] = xmm6;
+  chaining[6] = xmm13;
+  chaining[7] = xmm15;
+
+  return;
+}
+#endif
+
--- a/algo/groestl/sse2/grsi.c
+++ b/algo/groestl/sse2/grsi.c
@@ -0,0 +1,273 @@
+/* hash.c     Aug 2011
+ *
+ * Groestl implementation for different versions.
+ * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+ *
+ * This code is placed in the public domain
+ */
+
+#include "grsi.h"
+#include "grsi-asm.h"
+
+/* Statement macro that initialises the context named sts_grs, which must be
+ * a grsiState visible at the point of expansion. It performs the same steps
+ * as grsiInit(): set the variant parameters, rebuild the shared constants,
+ * clear state and buffer, install the initial value and convert it with
+ * grsiINIT().
+ * The former "chaining == NULL || buffer == NULL" test was removed: both are
+ * arrays inside grsiState, so it could never fire, and it hid a bare
+ * "return;" inside the macro.
+ */
+#define GRS_I \
+do { \
+  grsiState *ctx = &sts_grs; \
+  /* NOTE(review): i looks like the loop index used inside \
+     grsiSET_CONSTANTS() -- keep it declared in this scope */ \
+  u8 i = 0; \
+ \
+  /* set number of state columns and state size depending on \
+     variant */ \
+  ctx->grsicolumns = grsiCOLS; \
+  ctx->grsistatesize = grsiSIZE; \
+  ctx->grsiv = LONG; \
+ \
+  grsiSET_CONSTANTS(); \
+ \
+  memset(ctx->grsichaining, 0, sizeof(u64)*grsiSIZE/8); \
+  memset(ctx->grsibuffer, 0, sizeof(grsiBitSequence)*grsiSIZE); \
+ \
+  /* set initial value */ \
+  ctx->grsichaining[ctx->grsicolumns-1] = grsiU64BIG((u64)grsiLENGTH); \
+ \
+  grsiINIT(ctx->grsichaining); \
+ \
+  /* set other variables */ \
+  ctx->grsibuf_ptr = 0; \
+  ctx->grsiblock_counter = 0; \
+  ctx->grsibits_in_last_byte = 0; \
+ \
+} while (0)
+
+/* Absorb every complete grsiSIZE-byte block contained in the first len bytes
+   of in; a trailing partial block is ignored. The block counter is advanced
+   by the number of complete blocks. */
+void grsiTransform(grsiState *ctx, const u8 *in, unsigned long long len)
+{
+  unsigned long long remaining = len;
+  const u8 *block = in;
+
+  /* account for all complete blocks up front */
+  ctx->grsiblock_counter += len/grsiSIZE;
+
+  /* compress one block per iteration */
+  while (remaining >= grsiSIZE) {
+    grsiTF1024((u64*)ctx->grsichaining, (u64*)block);
+    block += grsiSIZE;
+    remaining -= grsiSIZE;
+  }
+
+  asm volatile ("emms");
+}
+
+/* Output transformation h <- P(h)+h, delegated to grsiOF1024(), which
+   writes back only the truncated upper half of the chaining state. */
+void grsiOutputTransformation(grsiState *ctx)
+{
+  u64 *state = (u64*)ctx->grsichaining;
+
+  grsiOF1024(state);
+
+  asm volatile ("emms");
+}
+
+/* initialise context
+ * Sets the variant parameters, rebuilds the shared constants with
+ * grsiSET_CONSTANTS(), clears state and buffer, installs the initial value
+ * and converts it with grsiINIT().
+ * Returns without doing anything when ctx is NULL or grsiLENGTH is not a
+ * valid digest size.
+ */
+void grsiInit(grsiState* ctx) {
+  /* NOTE(review): i looks like it is also the loop index used inside
+     grsiSET_CONSTANTS() -- keep it declared in this scope */
+  u8 i = 0;
+
+  /* ctx is the only thing that can be NULL here: grsichaining and grsibuffer
+     are arrays inside grsiState, so the old check on them (made after they
+     had already been written) could never fire */
+  if (ctx == NULL)
+    return;
+
+  /* output size (in bits) must be a positive integer less than or
+     equal to 512, and divisible by 8 */
+  if (grsiLENGTH <= 0 || (grsiLENGTH%8) || grsiLENGTH > 512)
+    return;
+
+  /* set number of state columns and state size depending on
+     variant */
+  ctx->grsicolumns = grsiCOLS;
+  ctx->grsistatesize = grsiSIZE;
+  ctx->grsiv = LONG;
+
+  grsiSET_CONSTANTS();
+
+  for (i=0; i<grsiSIZE/8; i++)
+    ctx->grsichaining[i] = 0;
+  for (i=0; i<grsiSIZE; i++)
+    ctx->grsibuffer[i] = 0;
+
+  /* set initial value */
+  ctx->grsichaining[ctx->grsicolumns-1] = grsiU64BIG((u64)grsiLENGTH);
+
+  grsiINIT(ctx->grsichaining);
+
+  /* set other variables */
+  ctx->grsibuf_ptr = 0;
+  ctx->grsiblock_counter = 0;
+  ctx->grsibits_in_last_byte = 0;
+}
+
+/* update state with databitlen bits of input
+ * Complete blocks are compressed immediately; leftover bytes are kept in
+ * ctx->grsibuffer for the next call or for grsiFinal().
+ * A bit length that is not a multiple of 8 may only be passed in the last
+ * call: once a partial byte is stored, later calls return without effect.
+ * input must provide databitlen/8 bytes, plus one more when databitlen is
+ * not a multiple of 8.
+ * NOTE(review): full blocks are handed to grsiTransform() straight from
+ * input, which then reads them as 128-bit words -- confirm callers pass
+ * suitably aligned data.
+ */
+void grsiUpdate(grsiState* ctx,
+		  const grsiBitSequence* input,
+		  grsiDataLength databitlen) {
+  /* byte counts stay in grsiDataLength so very long inputs are not
+     truncated to int */
+  grsiDataLength index = 0;
+  const grsiDataLength msglen = databitlen/8;
+  const int rem = (int)(databitlen%8);
+
+  /* non-integral number of message bytes can only be supplied in the
+     last call to this function */
+  if (ctx->grsibits_in_last_byte) return;
+
+  /* if the buffer contains data that has not yet been digested, first
+     add data to buffer until full */
+  if (ctx->grsibuf_ptr) {
+    while (ctx->grsibuf_ptr < ctx->grsistatesize && index < msglen) {
+      ctx->grsibuffer[ctx->grsibuf_ptr++] = input[index++];
+    }
+    if (ctx->grsibuf_ptr < ctx->grsistatesize) {
+      /* buffer still not full, return */
+      if (rem) {
+        ctx->grsibits_in_last_byte = rem;
+        ctx->grsibuffer[ctx->grsibuf_ptr++] = input[index];
+      }
+      return;
+    }
+
+    /* digest buffer; this is a normal path, so nothing is printed */
+    ctx->grsibuf_ptr = 0;
+    grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize);
+  }
+
+  /* digest bulk of message */
+  grsiTransform(ctx, input+index, msglen-index);
+  index += ((msglen-index)/(grsiDataLength)ctx->grsistatesize)
+           *(grsiDataLength)ctx->grsistatesize;
+
+  /* store remaining data in buffer */
+  while (index < msglen) {
+    ctx->grsibuffer[ctx->grsibuf_ptr++] = input[index++];
+  }
+
+  /* if non-integral number of bytes have been supplied, store
+     remaining bits in last byte, together with information about
+     number of bits */
+  if (rem) {
+    ctx->grsibits_in_last_byte = rem;
+    ctx->grsibuffer[ctx->grsibuf_ptr++] = input[index];
+  }
+}
+
+/* update state with exactly 64 bytes (512 bits) of input
+ * Fixed-length front end to grsiUpdate(). The previous body was a verbatim
+ * copy of grsiUpdate() with databitlen pinned to 64*8, so all of its
+ * partial-byte branches were dead code; delegating keeps the two entry
+ * points from drifting apart.
+ */
+void grsiUpdateq(grsiState* ctx, const grsiBitSequence* input)
+{
+  grsiUpdate(ctx, input, (grsiDataLength)(64*8));
+}
+
+#define BILB ctx->grsibits_in_last_byte
+
+/* Finish the hash: append the padding and the 8-byte block count, compress
+   the last block(s), run the output transformation, copy the final
+   grsiLENGTH/8 bytes of the state to output and wipe state and buffer. */
+void grsiFinal(grsiState* ctx,
+		 grsiBitSequence* output) {
+  const int digest_bytes = grsiLENGTH/8;
+  const int length_field_start = ctx->grsistatesize-grsiLENGTHFIELDLEN;
+  const u8 *state_bytes = (grsiBitSequence*)ctx->grsichaining;
+  int k;
+
+  /* the mandatory '1' padding bit */
+  if (!BILB) {
+    /* message ended on a byte boundary: the bit opens a new byte */
+    ctx->grsibuffer[ctx->grsibuf_ptr++] = 0x80;
+  } else {
+    /* keep the valid high bits of the partial byte, then set the next bit */
+    grsiBitSequence *last = &ctx->grsibuffer[ctx->grsibuf_ptr-1];
+    *last &= ((1<<BILB)-1)<<(8-BILB);
+    *last ^= 0x1<<(7-BILB);
+    BILB = 0;
+  }
+
+  /* no room left for the length field: zero-fill and compress this block */
+  if (ctx->grsibuf_ptr > length_field_start) {
+    for (; ctx->grsibuf_ptr < ctx->grsistatesize; ctx->grsibuf_ptr++)
+      ctx->grsibuffer[ctx->grsibuf_ptr] = 0;
+    grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize);
+    ctx->grsibuf_ptr = 0;
+  }
+
+  /* zero-fill up to the length field */
+  for (; ctx->grsibuf_ptr < length_field_start; ctx->grsibuf_ptr++)
+    ctx->grsibuffer[ctx->grsibuf_ptr] = 0;
+
+  /* block count (including this final block), written from the last byte
+     backwards so the most significant byte comes first */
+  ctx->grsiblock_counter++;
+  for (ctx->grsibuf_ptr = ctx->grsistatesize;
+       ctx->grsibuf_ptr > length_field_start; ) {
+    ctx->grsibuffer[--ctx->grsibuf_ptr] = (u8)ctx->grsiblock_counter;
+    ctx->grsiblock_counter >>= 8;
+  }
+
+  /* compress the final block, then apply the output transformation */
+  grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize);
+  grsiOutputTransformation(ctx);
+
+  /* the digest is the tail of the state */
+  for (k = 0; k < digest_bytes; k++)
+    output[k] = state_bytes[ctx->grsistatesize-digest_bytes+k];
+
+  /* wipe state and buffer; both live inside the context, nothing is freed */
+  for (k = 0; k < ctx->grsicolumns; k++)
+    ctx->grsichaining[k] = 0;
+  for (k = 0; k < ctx->grsistatesize; k++)
+    ctx->grsibuffer[k] = 0;
+}
+
--- a/algo/groestl/sse2/grsi.h
+++ b/algo/groestl/sse2/grsi.h
@@ -0,0 +1,79 @@
+/* hash.h     Aug 2011
+ *
+ * Groestl implementation for different versions.
+ * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+ *
+ * This code is placed in the public domain
+ */
+
+#ifndef __grsi_h
+#define __grsi_h
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "brg_endian.h"
+#define NEED_UINT_64T
+#include "brg_types.h"
+
+/* digest length in bits; it selects the state geometry further down */
+#define grsiLENGTH 512
+
+/* some sizes (number of bytes) */
+#define grsiROWS 8
+#define grsiLENGTHFIELDLEN grsiROWS
+#define grsiCOLS512 8
+#define grsiCOLS1024 16
+#define grsiSIZE512 (grsiROWS*grsiCOLS512)
+#define grsiSIZE1024 (grsiROWS*grsiCOLS1024)
+#define grsiROUNDS512 10
+#define grsiROUNDS1024 14
+
+/* with grsiLENGTH fixed at 512 above, the #else branch is the one taken:
+   16 columns, 128-byte state, 14 rounds */
+#if grsiLENGTH<=256
+#define grsiCOLS grsiCOLS512
+#define grsiSIZE grsiSIZE512
+#define grsiROUNDS grsiROUNDS512
+#else
+#define grsiCOLS grsiCOLS1024
+#define grsiSIZE grsiSIZE1024
+#define grsiROUNDS grsiROUNDS1024
+#endif
+
+/* rotate the 64-bit value a left by n bits; n must be 1..63, because n == 0
+   would shift right by 64. li_64 is defined elsewhere, presumably as a
+   64-bit literal helper -- confirm in brg_types.h */
+#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
+
+/* Byte-order helpers.
+ * grsiEXT_BYTE(var,n): byte n of the 64-bit value var, counted in memory
+ *                      order for the platform.
+ * grsiU64BIG(a):       a converted to big-endian byte order (identity on
+ *                      big-endian platforms, byte swap on little-endian).
+ * The macro argument n is parenthesised in both variants so that an
+ * expression such as i+1 is multiplied as a whole.
+ */
+#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
+#define grsiEXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
+#define grsiU64BIG(a) (a)
+#endif /* IS_BIG_ENDIAN */
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define grsiEXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(n))))
+#define grsiU64BIG(a) \
+  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
+   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
+   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
+   (ROTL64(a,56) & li_64(FF000000FF000000)))
+#endif /* IS_LITTLE_ENDIAN */
+
+/* state-size variant tag stored in grsiState.grsiv.
+   NOTE(review): the enumerators LONG and SHORT are plain global names and
+   would clash with any other header declaring the same ones in the same
+   translation unit -- confirm the two are never included together. */
+typedef enum { LONG, SHORT } grsiVar;
+
+/* NIST API begin */
+typedef unsigned char grsiBitSequence;
+typedef unsigned long long grsiDataLength;
+/* hashing context; state and data buffer are embedded arrays, so a context
+   owns no separately allocated memory */
+typedef struct {
+  __attribute__ ((aligned (32))) u64 grsichaining[grsiSIZE/8];      /* actual state */
+  __attribute__ ((aligned (32))) grsiBitSequence grsibuffer[grsiSIZE];  /* data buffer */
+  u64 grsiblock_counter;        /* message block counter */
+  int grsibuf_ptr;              /* data buffer pointer */
+  int grsibits_in_last_byte;    /* no. of message bits in last byte of
+                               data buffer */
+  int grsicolumns;              /* no. of columns in state */
+  int grsistatesize;            /* total no. of bytes in state */
+  grsiVar grsiv;                    /* LONG or SHORT */
+} grsiState;
+
+/* initialise a context */
+void grsiInit(grsiState*);
+/* absorb input; the length argument is in bits */
+void grsiUpdate(grsiState*, const grsiBitSequence*, grsiDataLength);
+/* finish and write the digest to the output buffer */
+void grsiFinal(grsiState*, grsiBitSequence*);
+/* NIST API end   */
+
+#endif /* __grsi_h */
--- a/algo/groestl/sse2/grsn-asm.h
+++ b/algo/groestl/sse2/grsn-asm.h
--- a/algo/groestl/sse2/grsn.c
+++ b/algo/groestl/sse2/grsn.c
@@ -0,0 +1,247 @@
+/* hash.c     Aug 2011
+ *
+ * Groestl implementation for different versions.
+ * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+ *
+ * This code is placed in the public domain
+ */
+
+#include "grsn-asm.h"
+
+/* Absorb every complete grsnSIZE-byte block contained in the first len bytes
+   of in; a trailing partial block is ignored. The block counter is advanced
+   by the number of complete blocks. */
+void grsnTransform(grsnState *ctx, const u8 *in, unsigned long long len)
+{
+  unsigned long long remaining = len;
+  const u8 *block = in;
+
+  /* account for all complete blocks up front */
+  ctx->block_counter += len/grsnSIZE;
+
+  /* compress one block per iteration with the variant's function */
+  while (remaining >= grsnSIZE) {
+#if grsnLENGTH<=256
+    TF512((u64*)ctx->chaining, (u64*)block);
+#else
+    TF1024((u64*)ctx->chaining, (u64*)block);
+#endif
+    block += grsnSIZE;
+    remaining -= grsnSIZE;
+  }
+
+  asm volatile ("emms");
+}
+
+/* Output transformation h <- P(h)+h, using the routine that matches the
+   digest length selected at compile time. */
+void grsnOutputTransformation(grsnState *ctx)
+{
+  u64 *state = (u64*)ctx->chaining;
+
+#if (grsnLENGTH <= 256)
+  OF512(state);
+#else
+  OF1024(state);
+#endif
+
+  asm volatile ("emms");
+}
+
+/* initialise context
+ * Sets the variant parameters, rebuilds the shared constants with
+ * SET_CONSTANTS(), clears state and buffer, installs the initial value and
+ * converts it with INIT().
+ * Returns without doing anything when ctx is NULL or grsnLENGTH is not a
+ * valid digest size.
+ */
+void grsnInit(grsnState* ctx) {
+  /* NOTE(review): i may also be the loop index used inside SET_CONSTANTS()
+     -- keep it declared in this scope */
+  u8 i = 0;
+
+  /* ctx is the only thing that can be NULL here: the old check tested
+     ctx->chaining and ctx->buffer after both had already been written */
+  if (ctx == NULL)
+    return;
+
+  /* output size (in bits) must be a positive integer less than or
+     equal to 512, and divisible by 8 */
+  if (grsnLENGTH <= 0 || (grsnLENGTH%8) || grsnLENGTH > 512)
+    return;
+
+  /* set number of state columns and state size depending on
+     variant */
+  ctx->columns = grsnCOLS;
+  ctx->statesize = grsnSIZE;
+#if (grsnLENGTH <= 256)
+  ctx->v = SHORT;
+#else
+  ctx->v = LONG;
+#endif
+
+  SET_CONSTANTS();
+
+  for (i=0; i<grsnSIZE/8; i++)
+    ctx->chaining[i] = 0;
+  for (i=0; i<grsnSIZE; i++)
+    ctx->buffer[i] = 0;
+
+  /* set initial value */
+  ctx->chaining[ctx->columns-1] = U64BIG((u64)grsnLENGTH);
+
+  INIT(ctx->chaining);
+
+  /* set other variables */
+  ctx->buf_ptr = 0;
+  ctx->block_counter = 0;
+  ctx->bits_in_last_byte = 0;
+}
+
+/* update state with databitlen bits of input
+ * Complete blocks are compressed immediately; leftover bytes are kept in
+ * ctx->buffer for the next call or for grsnFinal().
+ * A bit length that is not a multiple of 8 may only be passed in the last
+ * call: once a partial byte is stored, later calls return without effect.
+ * input must provide databitlen/8 bytes, plus one more when databitlen is
+ * not a multiple of 8.
+ */
+void grsnUpdate(grsnState* ctx,
+		  const BitSequence* input,
+		  DataLength databitlen) {
+  /* byte counts stay in DataLength so very long inputs are not truncated
+     to int */
+  DataLength index = 0;
+  const DataLength msglen = databitlen/8;
+  const int rem = (int)(databitlen%8);
+
+  /* non-integral number of message bytes can only be supplied in the
+     last call to this function */
+  if (ctx->bits_in_last_byte) return;
+
+  /* if the buffer contains data that has not yet been digested, first
+     add data to buffer until full */
+  if (ctx->buf_ptr) {
+    while (ctx->buf_ptr < ctx->statesize && index < msglen) {
+      ctx->buffer[ctx->buf_ptr++] = input[index++];
+    }
+    if (ctx->buf_ptr < ctx->statesize) {
+      /* buffer still not full, return */
+      if (rem) {
+        ctx->bits_in_last_byte = rem;
+        ctx->buffer[ctx->buf_ptr++] = input[index];
+      }
+      return;
+    }
+
+    /* digest buffer; this is a normal path, so nothing is printed */
+    ctx->buf_ptr = 0;
+    grsnTransform(ctx, ctx->buffer, ctx->statesize);
+  }
+
+  /* digest bulk of message */
+  grsnTransform(ctx, input+index, msglen-index);
+  index += ((msglen-index)/(DataLength)ctx->statesize)
+           *(DataLength)ctx->statesize;
+
+  /* store remaining data in buffer */
+  while (index < msglen) {
+    ctx->buffer[ctx->buf_ptr++] = input[index++];
+  }
+
+  /* if non-integral number of bytes have been supplied, store
+     remaining bits in last byte, together with information about
+     number of bits */
+  if (rem) {
+    ctx->bits_in_last_byte = rem;
+    ctx->buffer[ctx->buf_ptr++] = input[index];
+  }
+}
+
+/* update state with exactly 64 bytes (512 bits) of input
+ * Fixed-length front end to grsnUpdate(). The previous body duplicated
+ * grsnUpdate() with the length pinned to 64*8 (leaving its partial-byte
+ * branches dead) but omitted the bits_in_last_byte guard, so it would append
+ * whole bytes after a stored partial byte. Delegating restores the guard and
+ * keeps the two entry points from drifting apart.
+ */
+void grsnUpdateq(grsnState* ctx, const BitSequence* input)
+{
+  grsnUpdate(ctx, input, (DataLength)(64*8));
+}
+
+#define BILB ctx->bits_in_last_byte
+
+/* Finish the hash: append the padding and the 8-byte block count, compress
+   the last block(s), run the output transformation, copy the final
+   grsnLENGTH/8 bytes of the state to output and wipe state and buffer. */
+void grsnFinal(grsnState* ctx,
+		 BitSequence* output) {
+  const int digest_bytes = grsnLENGTH/8;
+  const int length_field_start = ctx->statesize-grsnLENGTHFIELDLEN;
+  const u8 *state_bytes = (BitSequence*)ctx->chaining;
+  int k;
+
+  /* the mandatory '1' padding bit */
+  if (!BILB) {
+    /* message ended on a byte boundary: the bit opens a new byte */
+    ctx->buffer[ctx->buf_ptr++] = 0x80;
+  } else {
+    /* keep the valid high bits of the partial byte, then set the next bit */
+    BitSequence *last = &ctx->buffer[ctx->buf_ptr-1];
+    *last &= ((1<<BILB)-1)<<(8-BILB);
+    *last ^= 0x1<<(7-BILB);
+    BILB = 0;
+  }
+
+  /* no room left for the length field: zero-fill and compress this block */
+  if (ctx->buf_ptr > length_field_start) {
+    for (; ctx->buf_ptr < ctx->statesize; ctx->buf_ptr++)
+      ctx->buffer[ctx->buf_ptr] = 0;
+    grsnTransform(ctx, ctx->buffer, ctx->statesize);
+    ctx->buf_ptr = 0;
+  }
+
+  /* zero-fill up to the length field */
+  for (; ctx->buf_ptr < length_field_start; ctx->buf_ptr++)
+    ctx->buffer[ctx->buf_ptr] = 0;
+
+  /* block count (including this final block), written from the last byte
+     backwards so the most significant byte comes first */
+  ctx->block_counter++;
+  for (ctx->buf_ptr = ctx->statesize; ctx->buf_ptr > length_field_start; ) {
+    ctx->buffer[--ctx->buf_ptr] = (u8)ctx->block_counter;
+    ctx->block_counter >>= 8;
+  }
+
+  /* compress the final block, then apply the output transformation */
+  grsnTransform(ctx, ctx->buffer, ctx->statesize);
+  grsnOutputTransformation(ctx);
+
+  /* the digest is the tail of the state */
+  for (k = 0; k < digest_bytes; k++)
+    output[k] = state_bytes[ctx->statesize-digest_bytes+k];
+
+  /* wipe state and buffer; nothing is freed here */
+  for (k = 0; k < ctx->columns; k++)
+    ctx->chaining[k] = 0;
+  for (k = 0; k < ctx->statesize; k++)
+    ctx->buffer[k] = 0;
+}
+
--- a/algo/groestl/sse2/grsn.h
+++ b/algo/groestl/sse2/grsn.h
@@ -0,0 +1,80 @@
+/* hash.h     Aug 2011
+ *
+ * Groestl implementation for different versions.
+ * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+ *
+ * This code is placed in the public domain
+ */
+
+#ifndef __grsn_h
+#define __grsn_h
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "brg_endian.h"
+#define NEED_UINT_64T
+#include "brg_types.h"
+
+#ifndef grsnLENGTH
+#define grsnLENGTH 512
+#endif
+/* grsnLENGTH is the digest length in bits; it can be predefined by the includer */
+/* state geometry (rows, columns, bytes) and round counts for both state sizes */
+#define grsnROWS 8
+#define grsnLENGTHFIELDLEN grsnROWS
+#define grsnCOLS512 8
+#define grsnCOLS1024 16
+#define grsnSIZE512 (grsnROWS*grsnCOLS512)
+#define grsnSIZE1024 (grsnROWS*grsnCOLS1024)
+#define grsnROUNDS512 10
+#define grsnROUNDS1024 14
+/* digests of up to 256 bits use the 512-bit state, longer ones the 1024-bit state */
+#if grsnLENGTH<=256
+#define grsnCOLS grsnCOLS512
+#define grsnSIZE grsnSIZE512
+#define grsnROUNDS grsnROUNDS512
+#else
+#define grsnCOLS grsnCOLS1024
+#define grsnSIZE grsnSIZE1024
+#define grsnROUNDS grsnROUNDS1024
+#endif
+
+#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
+/* EXT_BYTE(var,n): byte n of the 64-bit value var, counted in memory order */
+#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
+#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
+#define U64BIG(a) (a)
+#endif /* IS_BIG_ENDIAN */
+/* U64BIG(a): host order <-> big-endian; a 64-bit byte swap on little-endian hosts */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(n))))
+#define U64BIG(a) \
+  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
+   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
+   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
+   (ROTL64(a,56) & li_64(FF000000FF000000)))
+#endif /* IS_LITTLE_ENDIAN */
+
+typedef enum { LONG, SHORT } Var;
+/* NOTE(review): the enumerators LONG/SHORT are also declared by grsv.h - verify the two headers are never combined */
+/* NIST API begin */
+typedef unsigned char BitSequence;
+typedef unsigned long long DataLength;
+typedef struct {
+  __attribute__ ((aligned (32))) u64 chaining[grsnSIZE/8];      /* actual state */
+  __attribute__ ((aligned (32))) BitSequence buffer[grsnSIZE];  /* data buffer */
+  u64 block_counter;        /* message block counter */
+  int buf_ptr;              /* index of the next free byte in buffer */
+  int bits_in_last_byte;    /* no. of message bits in last byte of
+                               data buffer */
+  int columns;              /* no. of columns in state */
+  int statesize;            /* total no. of bytes in state */
+  Var v;                    /* LONG or SHORT */
+} grsnState;
+/* grsnFinal writes grsnLENGTH/8 digest bytes to its second argument and zeroes the state and buffer */
+void grsnInit(grsnState*);
+void grsnUpdate(grsnState*, const BitSequence*, DataLength);
+void grsnFinal(grsnState*, BitSequence*);
+
+#endif /* __grsn_h */
--- a/algo/groestl/sse2/grso-asm.c
+++ b/algo/groestl/sse2/grso-asm.c
--- a/algo/groestl/sse2/grso-asm.h
+++ b/algo/groestl/sse2/grso-asm.h
@@ -0,0 +1,10 @@
+#ifndef GRSOASM_H
+#define GRSOASM_H
+/* P and Q permutations used by grso.c; both rewrite the array x in place */
+#include "grso.h"
+/* P permutation: grso.c applies it to h^m and, in the output transformation, to h */
+void grsoP1024ASM (u64 *x) ;
+/* Q permutation: grso.c applies it to a copy of the message block */
+void grsoQ1024ASM (u64 *x) ;
+
+#endif 
--- a/algo/groestl/sse2/grso-asm2.c
+++ b/algo/groestl/sse2/grso-asm2.c
--- a/algo/groestl/sse2/grso-asm2.h
+++ b/algo/groestl/sse2/grso-asm2.h
@@ -0,0 +1,11 @@
+#ifndef GRSOASM_H
+#define GRSOASM_H
+/* Same declarations as the mmx asm header, kept separately in case they need to
+   diverge. NOTE: this reuses grso-asm.h's GRSOASM_H guard, so only the first one included counts. */
+#include "grso.h"
+/* P permutation, applied in place to x (see its use in grso.c) */
+void grsoP1024ASM (u64 *x) ;
+/* Q permutation, applied in place to x (see its use in grso.c) */
+void grsoQ1024ASM (u64 *x) ;
+
+#endif 
--- a/algo/groestl/sse2/grso-macro.c
+++ b/algo/groestl/sse2/grso-macro.c
@@ -0,0 +1,110 @@
+/* hash.c     January 2011
+ *
+ * Groestl-512 implementation with inline assembly containing mmx and
+ * sse instructions. Optimized for Opteron.
+ * Authors: Krystian Matusiewicz and Soeren S. Thomsen
+ *
+ * This code is placed in the public domain
+ */
+
+//#include "grso.h"
+//#include "grso-asm.h"
+// #include "grsotab.h"
+
+#define DECL_GRS
+
+/* reset 'sts_grs' to the initial Groestl-512 state */
+#define GRS_I \
+do { \
+  int i = grsoCOLS-1; \
+  /* initial value: digest size in bits, stored big-endian in the last column */ \
+  sts_grs.grsstate[i] = grsoU64BIG((u64)(8*grsoDIGESTSIZE)); \
+  while (i-- > 0) sts_grs.grsstate[i] = 0; \
+ \
+  /* nothing buffered and no blocks counted yet */ \
+  sts_grs.grsblock_counter = 0; \
+  sts_grs.grsbuf_ptr = 0; \
+} while (0); \
+
+/* absorb the 64 bytes at 'hash' into the state, staging partial blocks in 'hashbuf' */
+#define GRS_U \
+do { \
+    unsigned char* in = hash; \
+  unsigned long long index = 0; \
+  /* expects 'sts_grs', 'hashbuf' and 'hash' to be in scope at the expansion site */ \
+  /* if the buffer contains data that has not yet been digested, first \
+     add data to buffer until full */ \
+  if (sts_grs.grsbuf_ptr) { \
+    while (sts_grs.grsbuf_ptr < grsoSIZE && index < 64) { \
+      hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \
+    } \
+    if (sts_grs.grsbuf_ptr < grsoSIZE) continue; \
+    /* ^ buffer still not full: 'continue' jumps to the 'while (0)' test and so ends the macro */ \
+    /* digest buffer */ \
+    sts_grs.grsbuf_ptr = 0; \
+    grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
+  } \
+ \
+  /* digest bulk of message (grsoTransform consumes whole grsoSIZE-byte blocks only) */ \
+  grsoTransform(&sts_grs, in+index, 64-index); \
+  index += ((64-index)/grsoSIZE)*grsoSIZE; \
+ \
+  /* store remaining data in buffer */ \
+  while (index < 64) { \
+    hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \
+  } \
+ \
+} while (0);
+
+/* finish the Groestl-512 computation over the data buffered in 'hashbuf' */
+/* and write the grsoDIGESTSIZE-byte digest to 'hash' */
+#define GRS_C \
+do { \
+  unsigned char *s = (unsigned char*)sts_grs.grsstate; \
+  char *out = hash; \
+  int i, j; \
+ \
+  /* append the '1' padding bit as a 0x80 byte */ \
+  hashbuf[sts_grs.grsbuf_ptr++] = 0x80; \
+ \
+  /* no room left for the length field: zero-fill and digest this block */ \
+  if (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \
+    for (; sts_grs.grsbuf_ptr < grsoSIZE; sts_grs.grsbuf_ptr++) { \
+      hashbuf[sts_grs.grsbuf_ptr] = 0; \
+    } \
+    grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
+    sts_grs.grsbuf_ptr = 0; \
+  } \
+  /* zero-fill up to the start of the length field */ \
+  for (; sts_grs.grsbuf_ptr < grsoSIZE-grsoLENGTHFIELDLEN; sts_grs.grsbuf_ptr++) { \
+    hashbuf[sts_grs.grsbuf_ptr] = 0; \
+  } \
+ \
+  /* length field: the incremented block counter, least significant byte last */ \
+  sts_grs.grsblock_counter++; \
+  for (i = grsoSIZE-1; i >= grsoSIZE-grsoLENGTHFIELDLEN; i--) { \
+    hashbuf[i] = (unsigned char)sts_grs.grsblock_counter; \
+    sts_grs.grsblock_counter >>= 8; \
+  } \
+  sts_grs.grsbuf_ptr = grsoSIZE-grsoLENGTHFIELDLEN; \
+ \
+  /* digest the final padded block, then apply the output transformation */ \
+  grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
+  grsoOutputTransformation(&sts_grs); \
+ \
+  /* the digest is the last grsoDIGESTSIZE bytes of the state */ \
+  i = grsoSIZE-grsoDIGESTSIZE; \
+  for (j = 0; i < grsoSIZE; i++, j++) { \
+    out[j] = s[i]; \
+  } \
+ \
+  /* wipe the state and the data buffer */ \
+  for (i = grsoCOLS; i-- > 0; ) { \
+    sts_grs.grsstate[i] = 0; \
+  } \
+  for (i = grsoSIZE; i-- > 0; ) { \
+    hashbuf[i] = 0; \
+  } \
+} while (0); 
+ 
+
--- a/algo/groestl/sse2/grso.c
+++ b/algo/groestl/sse2/grso.c
@@ -0,0 +1,57 @@
+/* hash.c     January 2011
+ *
+ * Groestl-512 implementation with inline assembly containing mmx and
+ * sse instructions. Optimized for Opteron.
+ * Authors: Krystian Matusiewicz and Soeren S. Thomsen
+ *
+ * This code is placed in the public domain
+ */
+
+#include "algo/groestl/sse2/grso-asm.h"
+#include "algo/groestl/sse2/grso.h"
+#include "algo/groestl/sse2/grsotab.h"
+
+/* absorb every complete grsoSIZE-byte block of 'in'; a trailing partial block is ignored */
+void grsoTransform(grsoState *ctx, 
+	       const unsigned char *in, 
+	       unsigned long long len) {
+  u64 q_in[grsoCOLS+2] __attribute__ ((aligned (16)));
+  u64 p_in[grsoCOLS+2] __attribute__ ((aligned (16)));
+  u64 *state = (u64*)ctx->grsstate;
+  unsigned long long blocks = len/grsoSIZE;
+  int col;
+
+  /* account for all complete blocks up front */
+  ctx->grsblock_counter += blocks;
+
+  while (blocks-- > 0) {
+    const u64 *msg = (const u64*)in;
+    for (col = 0; col < grsoCOLS; col++) {
+      q_in[col] = msg[col];
+      p_in[col] = msg[col] ^ state[col];
+    }
+    grsoQ1024ASM(q_in);
+    grsoP1024ASM(p_in);
+
+    /* h' = h ^ Q(m) ^ P(h^m) */
+    for (col = 0; col < grsoCOLS; col++) {
+      state[col] ^= p_in[col] ^ q_in[col];
+    }
+    in += grsoSIZE;
+  }
+}
+
+/* output transformation: replace the state h with P(h) ^ h */
+void grsoOutputTransformation(grsoState *ctx) {
+  u64 perm[grsoCOLS] __attribute__ ((aligned (16)));
+  int col = 0;
+  while (col < grsoCOLS) {
+    perm[col] = ctx->grsstate[col];
+    col++;
+  }
+  grsoP1024ASM(perm);
+  for (col = grsoCOLS-1; col >= 0; col--) {
+    ctx->grsstate[col] ^= perm[col];
+  }
+}
+
--- a/algo/groestl/sse2/grso.h
+++ b/algo/groestl/sse2/grso.h
@@ -0,0 +1,62 @@
+#ifndef __hash_h
+#define __hash_h
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "brg_endian.h"
+#include "brg_types.h"
+
+/* state geometry: rows, columns and state size in bytes, plus the digest size in bytes */
+#define grsoROWS 8
+#define grsoLENGTHFIELDLEN grsoROWS
+#define grsoCOLS 16
+#define grsoSIZE (grsoROWS*grsoCOLS)
+#define grsoDIGESTSIZE 64
+/* NOTE(review): round count, presumably consumed by the assembly permutations - confirm there */
+#define grsoROUNDS 14
+
+#define grsoROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&((u64)0xffffffffffffffffULL))
+/* this implementation supports little-endian hosts only: stop the build otherwise */
+#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
+#error "grso.h: big-endian platforms are not supported"
+#endif /* IS_BIG_ENDIAN */
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(n)))) /* byte n of var, 0 = least significant */
+#define grsoU64BIG(a)				\
+  ((grsoROTL64(a, 8) & ((u64)0x000000ff000000ffULL)) |	\
+   (grsoROTL64(a,24) & ((u64)0x0000ff000000ff00ULL)) |	\
+   (grsoROTL64(a,40) & ((u64)0x00ff000000ff0000ULL)) |	\
+   (grsoROTL64(a,56) & ((u64)0xff000000ff000000ULL)))
+#endif /* IS_LITTLE_ENDIAN; grsoU64BIG is a 64-bit byte swap */
+
+typedef struct {
+  u64 grsstate[grsoCOLS];             /* actual state */
+  u64 grsblock_counter;           /* message block counter */
+  int grsbuf_ptr;                 /* no. of bytes held in the caller's data buffer */
+} grsoState;
+/* the data buffer itself is not part of the context; the GRS_U/GRS_C macros use a separate 'hashbuf' */
+//extern int grsoInit(grsoState* ctx); 
+//extern int grsoUpdate(grsoState* ctx, const unsigned char* in,
+//	   unsigned long long len);
+//extern int grsoUpdateq(grsoState* ctx, const unsigned char* in);
+//extern int grsoFinal(grsoState* ctx,
+//	  unsigned char* out); 
+//
+//extern int grsohash(unsigned char *out,
+//		const unsigned char *in,
+//		unsigned long long len);
+
+/* digest up to len bytes of input (full blocks only) */
+void grsoTransform( grsoState *ctx, const unsigned char *in,
+                            unsigned long long len );
+
+/* given state h, do h <- P(h)+h */
+void grsoOutputTransformation( grsoState *ctx );
+/* NOTE(review): init/update/close entry points defined outside this header; presumably wrap GRS_I/GRS_U/GRS_C - confirm */
+int grso_init ( grsoState* sts_grs );
+int grso_update ( grsoState* sts_grs, char* hashbuf, char* hash );
+int grso_close ( grsoState *sts_grs, char* hashbuf, char* hash );
+
+
+#endif /* __hash_h */
--- a/algo/groestl/sse2/grsotab.h
+++ b/algo/groestl/sse2/grsotab.h
--- a/algo/groestl/sse2/grss.c
+++ b/algo/groestl/sse2/grss.c
--- a/algo/groestl/sse2/grss_api.h
+++ b/algo/groestl/sse2/grss_api.h
@@ -0,0 +1,45 @@
+/*
+ * file        : hash_api.h
+ * version     : 1.0.208
+ * date        : 14.12.2010
+ * 
+ * Grostl multi-stream bitsliced implementation Hash API
+ *
+ * Cagdas Calik
+ * ccalik@metu.edu.tr
+ * Institute of Applied Mathematics, Middle East Technical University, Turkey.
+ *
+ */
+
+#ifndef GRSS_API_H
+#define GRSS_API_H
+
+#include "sha3_common.h"
+#include <tmmintrin.h>
+
+typedef struct
+{
+	__m128i state1[8];
+	__m128i state2[8];
+	__m128i state3[8];
+	__m128i state4[8];
+	/* NOTE(review): presumably per-round constants for P and Q plus shift constants - confirm in grss.c */
+	__m128i _Pconst[14][8];
+	__m128i	_Qconst[14][8];
+	__m128i	_shiftconst[8];
+	/* NOTE(review): presumably digest and block lengths; their units are not visible here */
+	unsigned int uHashLength;
+	unsigned int uBlockLength;
+	/* 128 bytes of BitSequence storage; NOTE(review): presumably pending input - confirm */
+	BitSequence buffer[128];
+
+} grssState;
+/* hash context API; NOTE(review): grssbitlen presumably selects the digest length in bits - confirm */
+void grssInit(grssState *state, int grssbitlen);
+/* databitlen: length of data, in bits judging by the parameter name - confirm in grss.c */
+void grssUpdate(grssState *state, const BitSequence *data, DataLength databitlen);
+
+void grssFinal(grssState *state, BitSequence *grssval);
+
+#endif // GRSS_API_H
+
--- a/algo/groestl/sse2/grstab.h
+++ b/algo/groestl/sse2/grstab.h
--- a/algo/groestl/sse2/grsv-asm.h
+++ b/algo/groestl/sse2/grsv-asm.h
--- a/algo/groestl/sse2/grsv.c
+++ b/algo/groestl/sse2/grsv.c
@@ -0,0 +1,202 @@
+/* hash.c     Aug 2011
+ *
+ * Groestl implementation for different versions.
+ * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+ *
+ * This code is placed in the public domain
+ */
+
+
+#include "grsv.h"
+#include "grsv-asm.h"
+
+/* digest every complete grsvSIZE-byte block of 'in'; leftover bytes are ignored */
+void grsvTransform(grsvState *ctx,
+	       const u8 *in, 
+	       unsigned long long len) {
+    unsigned long long blocks = len/grsvSIZE;
+
+    ctx->grsvblock_counter += blocks;
+
+    for (; blocks != 0; blocks--, in += grsvSIZE) {
+#if grsvLENGTH<=256
+      grsvTF512((u64*)ctx->grsvchaining, (u64*)in);
+#else
+      grsvTF1024((u64*)ctx->grsvchaining, (u64*)in);
+#endif
+    }
+    /* EMMS: empty the MMX state (the transforms presumably use MMX registers) */
+    asm volatile ("emms");
+}
+
+/* output transformation h <- P(h)+h for the compiled-in digest length */
+void grsvOutputTransformation(grsvState *ctx) {
+    u64 *h = (u64*)ctx->grsvchaining;
+
+#if (grsvLENGTH <= 256)
+    grsvOF512(h);
+#else
+    grsvOF1024(h);
+#endif
+
+    asm volatile ("emms");
+}
+
+/* Initialise a hash context: select the variant, load the constants, clear
+   the state and buffer, and install the initial value. */
+void grsvInit(grsvState* ctx) {
+  /* kept as u8: SET_CONSTANTS() may rely on an 'i' in scope - TODO confirm in grsv-asm.h */
+  u8 i = 0;
+
+  /* output size (in bits) must be a positive integer less than or
+     equal to 512, and divisible by 8 */
+  if (grsvLENGTH <= 0 || (grsvLENGTH%8) || grsvLENGTH > 512)
+    return;
+
+  /* set number of state columns and state size depending on
+     variant */
+  ctx->grsvcolumns = grsvCOLS;
+  ctx->grsvstatesize = grsvSIZE;
+#if (grsvLENGTH <= 256)
+    ctx->grsvv = SHORT;
+#else
+    ctx->grsvv = LONG;
+#endif
+
+  SET_CONSTANTS();
+
+  /* clear state and buffer; grsvSIZE is at most 128, so the u8 counter
+     cannot wrap */
+  for (i=0; i<grsvSIZE/8; i++)
+    ctx->grsvchaining[i] = 0;
+  for (i=0; i<grsvSIZE; i++)
+    ctx->grsvbuffer[i] = 0;
+
+  /* set initial value: the digest length in bits, big-endian, in the last
+     column (both arrays are struct members, so they can never be NULL) */
+  ctx->grsvchaining[ctx->grsvcolumns-1] = U64BIG((u64)grsvLENGTH);
+
+  grsvINIT(ctx->grsvchaining);
+
+  /* set other variables */
+  ctx->grsvbuf_ptr = 0;
+  ctx->grsvblock_counter = 0;
+  ctx->grsvbits_in_last_byte = 0;
+}
+
+/* Update state with databitlen bits of input: full blocks are digested, the
+   rest is buffered. The byte count is held in an int (inputs < 2^31 bytes). */
+void grsvUpdate(grsvState* ctx,
+		  const grsvBitSequence* input,
+		  grsvDataLength databitlen) {
+  int index = 0;
+  int msglen = (int)(databitlen/8);
+  int rem = (int)(databitlen%8);
+
+  /* non-integral number of message bytes can only be supplied in the
+     last call to this function */
+  if (ctx->grsvbits_in_last_byte) return;
+
+  /* if the buffer contains data that has not yet been digested, first
+     add data to buffer until full */
+  if (ctx->grsvbuf_ptr) {
+    while (ctx->grsvbuf_ptr < ctx->grsvstatesize && index < msglen) {
+      ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index++];
+    }
+    if (ctx->grsvbuf_ptr < ctx->grsvstatesize) {
+      /* buffer still not full, return */
+      if (rem) {
+        ctx->grsvbits_in_last_byte = rem;
+        ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index];
+      }
+      return;
+    }
+
+    /* digest buffer (the normal path once buffered data reaches a full block) */
+    ctx->grsvbuf_ptr = 0;
+    grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize);
+  }
+
+  /* digest bulk of message */
+  grsvTransform(ctx, input+index, msglen-index);
+  index += ((msglen-index)/ctx->grsvstatesize)*ctx->grsvstatesize;
+
+  /* store remaining data in buffer */
+  while (index < msglen) {
+    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index++];
+  }
+
+  /* if non-integral number of bytes have been supplied, store
+     remaining bits in last byte, together with information about
+     number of bits */
+  if (rem) {
+    ctx->grsvbits_in_last_byte = rem;
+    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index];
+  }
+  return;
+}
+
+#define BILB ctx->grsvbits_in_last_byte
+
+/* Finish the hash: pad the buffered message, append the block count, run the
+   output transformation, copy the digest to 'output' and wipe the context. */
+void grsvFinal(grsvState* ctx,
+		 grsvBitSequence* output) {
+  const int digest_bytes = grsvLENGTH/8;
+  const u8 *state_bytes = (const grsvBitSequence*)ctx->grsvchaining;
+  int pos, k;
+
+  if (BILB == 0) {
+    /* byte-aligned message: the '1' padding bit starts a fresh byte */
+    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = 0x80;
+  } else {
+    /* keep the BILB message bits of the last byte, then set the '1' bit */
+    u8 *last = &ctx->grsvbuffer[(int)ctx->grsvbuf_ptr-1];
+    *last &= ((1<<BILB)-1)<<(8-BILB);
+    *last ^= 0x1<<(7-BILB);
+    BILB = 0;
+  }
+
+  /* no room left for the length field: zero-fill and digest this block */
+  if (ctx->grsvbuf_ptr > ctx->grsvstatesize-grsvLENGTHFIELDLEN) {
+    for (; ctx->grsvbuf_ptr < ctx->grsvstatesize; ctx->grsvbuf_ptr++) {
+      ctx->grsvbuffer[(int)ctx->grsvbuf_ptr] = 0;
+    }
+    grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize);
+    ctx->grsvbuf_ptr = 0;
+  }
+  /* zero-fill up to the start of the length field */
+  for (; ctx->grsvbuf_ptr < ctx->grsvstatesize-grsvLENGTHFIELDLEN; ctx->grsvbuf_ptr++) {
+    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr] = 0;
+  }
+
+  /* length field: the incremented block counter fills the last
+     grsvLENGTHFIELDLEN bytes, least significant byte at the highest index */
+  ctx->grsvblock_counter++;
+  for (pos = ctx->grsvstatesize-1; pos >= ctx->grsvstatesize-grsvLENGTHFIELDLEN; pos--) {
+    ctx->grsvbuffer[pos] = (u8)ctx->grsvblock_counter;
+    ctx->grsvblock_counter >>= 8;
+  }
+  ctx->grsvbuf_ptr = ctx->grsvstatesize-grsvLENGTHFIELDLEN;
+
+  /* digest the final padded block, then apply the output transformation */
+  grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize);
+  grsvOutputTransformation(ctx);
+
+  /* the digest is the last grsvLENGTH/8 bytes of the chaining state */
+  pos = ctx->grsvstatesize-digest_bytes;
+  for (k = 0; pos < ctx->grsvstatesize; pos++, k++) {
+    output[k] = state_bytes[pos];
+  }
+
+  /* wipe the chaining state and the data buffer */
+  for (k = ctx->grsvcolumns; k-- > 0; ) {
+    ctx->grsvchaining[k] = 0;
+  }
+  for (k = ctx->grsvstatesize; k-- > 0; ) {
+    ctx->grsvbuffer[k] = 0;
+  }
+
+  return;
+}
+
--- a/algo/groestl/sse2/grsv.h
+++ b/algo/groestl/sse2/grsv.h
@@ -0,0 +1,77 @@
+/* hash.h     Aug 2011
+ *
+ * Groestl implementation for different versions.
+ * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+ *
+ * This code is placed in the public domain
+ */
+
+#ifndef __grsv_h
+#define __grsv_h
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "brg_endian.h"
+#define NEED_UINT_64T
+#include "brg_types.h"
+
+#define grsvLENGTH 512
+/* grsvLENGTH is the digest length in bits, fixed at compile time in this header */
+/* state geometry (rows, columns, bytes) and round counts for both state sizes */
+#define grsvROWS 8
+#define grsvLENGTHFIELDLEN grsvROWS
+#define grsvCOLS512 8
+#define grsvCOLS1024 16
+#define grsvSIZE512 (grsvROWS*grsvCOLS512)
+#define grsvSIZE1024 (grsvROWS*grsvCOLS1024)
+#define grsvROUNDS512 10
+#define grsvROUNDS1024 14
+/* digests of up to 256 bits use the 512-bit state, longer ones the 1024-bit state */
+#if grsvLENGTH<=256
+#define grsvCOLS grsvCOLS512
+#define grsvSIZE grsvSIZE512
+#define grsvROUNDS grsvROUNDS512
+#else
+#define grsvCOLS grsvCOLS1024
+#define grsvSIZE grsvSIZE1024
+#define grsvROUNDS grsvROUNDS1024
+#endif
+
+#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
+/* EXT_BYTE(var,n): byte n of the 64-bit value var, counted in memory order */
+#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
+#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
+#define U64BIG(a) (a)
+#endif /* IS_BIG_ENDIAN */
+/* U64BIG(a): host order <-> big-endian; a 64-bit byte swap on little-endian hosts */
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(n))))
+#define U64BIG(a) \
+  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
+   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
+   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
+   (ROTL64(a,56) & li_64(FF000000FF000000)))
+#endif /* IS_LITTLE_ENDIAN */
+
+typedef enum { LONG, SHORT } grsvVar;
+/* NOTE(review): the enumerators LONG/SHORT are also declared by grsn.h - verify the two headers are never combined */
+typedef unsigned char grsvBitSequence;
+typedef unsigned long long grsvDataLength;
+typedef struct {
+  __attribute__ ((aligned (32))) u64 grsvchaining[grsvSIZE/8];      /* actual state */
+  __attribute__ ((aligned (32))) grsvBitSequence grsvbuffer[grsvSIZE];  /* data buffer */
+  u64 grsvblock_counter;        /* message block counter */
+  int grsvbuf_ptr;              /* index of the next free byte in grsvbuffer */
+  int grsvbits_in_last_byte;    /* no. of message bits in last byte of
+                               data buffer */
+  int grsvcolumns;              /* no. of columns in state */
+  int grsvstatesize;            /* total no. of bytes in state */
+  grsvVar grsvv;                    /* LONG or SHORT */
+} grsvState;
+/* init; absorb databitlen bits of input; pad, write grsvLENGTH/8 digest bytes and wipe the context */
+void grsvInit(grsvState*);
+void grsvUpdate(grsvState*, const grsvBitSequence*, grsvDataLength);
+void grsvFinal(grsvState*, grsvBitSequence*);
+
+#endif /* __grsv_h */