Initial upload v3.4.7

This commit is contained in:
Jay D Dee
2016-09-22 13:16:18 -04:00
parent a3c8079774
commit a35039bc05
480 changed files with 211015 additions and 3 deletions

View File

Binary file not shown.

View File

@@ -0,0 +1,133 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
*/
#ifndef _BRG_ENDIAN_H
#define _BRG_ENDIAN_H
/* Tag values that PLATFORM_BYTE_ORDER is set to by the detection logic below */
#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
/* Include files where endian defines and byteswap functions may reside */
/* At most one branch is taken, selected by compiler/OS predefined macros; */
/* when none matches, no header is pulled in by this section. */
#if defined( __sun )
# include <sys/isa_defs.h>
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
# include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
# include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
/* MinGW and AIX builds skip both headers below */
# if !defined( __MINGW32__ ) && !defined( _AIX )
# include <endian.h>
/* <byteswap.h> is additionally skipped on BeOS */
# if !defined( __BEOS__ )
# include <byteswap.h>
# endif
# endif
#endif
/* Now attempt to set the define for platform byte order using any */
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */
/* seem to encompass most endian symbol definitions */
/* Each stanza below handles one spelling with the same rule: */
/* - if both the BIG and LITTLE symbols exist they are treated as tag */
/* values and the matching BYTE_ORDER symbol is compared against them */
/* (nothing is defined by that stanza if BYTE_ORDER is absent or */
/* equals neither); */
/* - if only one of the two exists, its presence alone selects the order. */
/* The stanzas are independent, so more than one may define */
/* PLATFORM_BYTE_ORDER. */

/* form 1: SYMBOL */
#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
/* form 2: _SYMBOL */
#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( _BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( _LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
/* form 3: __SYMBOL */
#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
/* form 4: __SYMBOL__ */
#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
/* if the platform byte order could not be determined, then try to */
/* set this define using the compiler's own predefined byte-order */
/* macros (__BYTE_ORDER__ with __ORDER_LITTLE_ENDIAN__ and */
/* __ORDER_BIG_ENDIAN__, as provided by GCC and Clang) and, failing */
/* that, common machine defines. This block is skipped entirely when */
/* PLATFORM_BYTE_ORDER has already been defined. */
#if !defined(PLATFORM_BYTE_ORDER)
#if defined( __BYTE_ORDER__ ) && defined( __ORDER_LITTLE_ENDIAN__ ) && \
( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif defined( __BYTE_ORDER__ ) && defined( __ORDER_BIG_ENDIAN__ ) && \
( __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
defined( vax ) || defined( vms ) || defined( VMS ) || \
defined( __VMS ) || defined( _M_X64 ) || defined( __x86_64__ ) || \
defined( __amd64__ )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
#endif
#endif
#endif

View File

@@ -0,0 +1,231 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
(a few lines added by Soeren S. Thomsen, October 2008)
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
The unsigned integer types defined here are of the form uint_<nn>t where
<nn> is the length of the type; for example, the unsigned 32-bit type is
'uint_32t'. These are NOT the same as the 'C99 integer types' that are
defined in the inttypes.h and stdint.h headers since attempts to use these
types have shown that support for them is still highly variable. However,
since the latter are of the form uint<nn>_t, a regular expression search
and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
can be used to convert the types used here to the C99 standard types.
*/
#ifndef _BRG_TYPES_H
#define _BRG_TYPES_H
#if defined(__cplusplus)
extern "C" {
#endif
#include <limits.h>
/* ptrint_t is the integer type the ALIGN_* macros cast pointers to. */
/* intptr_t is used whenever a header providing it can be identified: */
/* MSVC 7.0+ (<stddef.h>), GCC 3+ (<stdint.h>) or any compiler that */
/* reports C99 or later (<stdint.h>). Plain int is only the last resort */
/* for older compilers, since it cannot hold a pointer where pointers */
/* are wider than int. */
#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
# include <stddef.h>
# define ptrint_t intptr_t
#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
# include <stdint.h>
# define ptrint_t intptr_t
#elif defined( __STDC_VERSION__ ) && ( __STDC_VERSION__ >= 199901L )
# include <stdint.h>
# define ptrint_t intptr_t
#else
# define ptrint_t int
#endif
/* uint_8t: unsigned char, accepted only when it is exactly 8 bits wide. */
/* Predefining BRG_UI8 suppresses this definition. */
#if !defined( BRG_UI8 )
# define BRG_UI8
# if UCHAR_MAX != 255u
# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
# else
typedef unsigned char uint_8t;
# endif
#endif
/* uint_16t: unsigned short, accepted only when it is exactly 16 bits wide. */
/* Predefining BRG_UI16 suppresses this definition. */
#if !defined( BRG_UI16 )
# define BRG_UI16
# if USHRT_MAX != 65535u
# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
# else
typedef unsigned short uint_16t;
# endif
#endif
/* uint_32t: the first of unsigned int / unsigned long that is exactly */
/* 32 bits wide. li_32(h) turns the bare hex digits h into a literal of */
/* that type. Predefining BRG_UI32 suppresses this definition. */
#if !defined( BRG_UI32 )
# define BRG_UI32
# if UINT_MAX != 4294967295u && ULONG_MAX != 4294967295u
/* neither candidate is 32 bits wide: report it */
# if defined( _CRAY )
# error This code needs 32-bit data types, which Cray machines do not provide
# else
# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
# endif
# elif UINT_MAX == 4294967295u
# define li_32(h) 0x##h##u
typedef unsigned int uint_32t;
# else
# define li_32(h) 0x##h##ul
typedef unsigned long uint_32t;
# endif
#endif
/* uint_64t: 64-bit unsigned type, with li_64(h) building a literal of that */
/* type from the bare hex digits h. The first matching branch wins: */
/* compiler-specific cases first, then unsigned int, unsigned long and */
/* unsigned long long are tried via their <limits.h> maxima. */
/* BRG_UI64 is defined only when a type was actually chosen. A branch whose */
/* maximum is wider than 32 bits but is not exactly 2^64-1 is entered and */
/* defines nothing, and the later branches are then not tried. */
#ifndef BRG_UI64
# if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
# define BRG_UI64
# define li_64(h) 0x##h##ui64
typedef unsigned __int64 uint_64t;
# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */
# define BRG_UI64
# define li_64(h) 0x##h##ui64
typedef unsigned __int64 uint_64t;
# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# elif defined( __MVS__ )
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned int long long uint_64t;
# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
# if UINT_MAX == 18446744073709551615u
# define BRG_UI64
# define li_64(h) 0x##h##u
typedef unsigned int uint_64t;
# endif
# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
# if ULONG_MAX == 18446744073709551615ul
# define BRG_UI64
# define li_64(h) 0x##h##ul
typedef unsigned long uint_64t;
# endif
# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
# if ULLONG_MAX == 18446744073709551615ull
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# endif
# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
# if ULONG_LONG_MAX == 18446744073709551615ull
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# endif
# endif
#endif
/* A missing 64-bit type is an error only if the includer defined NEED_UINT_64T */
#if !defined( BRG_UI64 )
# if defined( NEED_UINT_64T )
# error Please define uint_64t as an unsigned 64 bit type in brg_types.h
# endif
#endif
/* VOID_RETURN / INT_RETURN: return-type prefixes (void / int) that also */
/* carry the export/import and calling-convention decoration: */
/* DLL_EXPORT -> dllexport (plus __stdcall on Microsoft/Intel compilers) */
/* DLL_IMPORT -> dllimport (plus __stdcall on Microsoft/Intel compilers) */
/* __WATCOMC__ -> __cdecl */
/* otherwise -> plain void / int */
/* DLL_EXPORT is tested before DLL_IMPORT. Predefining RETURN_VALUES */
/* suppresses the whole block. */
#ifndef RETURN_VALUES
# define RETURN_VALUES
# if defined( DLL_EXPORT )
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
# define VOID_RETURN __declspec( dllexport ) void __stdcall
# define INT_RETURN __declspec( dllexport ) int __stdcall
# elif defined( __GNUC__ )
# define VOID_RETURN __declspec( __dllexport__ ) void
# define INT_RETURN __declspec( __dllexport__ ) int
# else
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
# endif
# elif defined( DLL_IMPORT )
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
# define VOID_RETURN __declspec( dllimport ) void __stdcall
# define INT_RETURN __declspec( dllimport ) int __stdcall
# elif defined( __GNUC__ )
# define VOID_RETURN __declspec( __dllimport__ ) void
# define INT_RETURN __declspec( __dllimport__ ) int
# else
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
# endif
# elif defined( __WATCOMC__ )
# define VOID_RETURN void __cdecl
# define INT_RETURN int __cdecl
# else
# define VOID_RETURN void
# define INT_RETURN int
# endif
#endif
/* Pointer alignment helpers. All offsets are in bytes and 'n' must be a
power of 2.
ALIGN_OFFSET(x,n) byte distance (zero or positive) of pointer 'x'
above the nearest 'n' byte boundary at or below it
ALIGN_FLOOR(x,n) the highest 'n' byte aligned uint_8t pointer that
is not above 'x'
ALIGN_CEIL(x,n) the lowest 'n' byte aligned uint_8t pointer that
is not below 'x'
ALIGN_FLOOR and ALIGN_CEIL are expressed through ALIGN_OFFSET: the floor
subtracts the offset of 'x', the ceiling adds the offset of '-x'.
*/
#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1))
#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ALIGN_OFFSET((x), (n)))
#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + ALIGN_OFFSET(-((ptrint_t)(x)), (n)))
/* These defines are used to declare buffers in a way that allows
faster operations on longer variables to be used. In all these
defines 'size' must be a power of 2 and >= 8. NOTE that the
buffer size is in bytes but the type length is in bits
UI_TYPE(size) names the unsigned type uint_<size>t; after
expansion 'size' must be a bare number token
because it is pasted into the type name
UNIT_TYPEDEF(x,size) declares a variable 'x' of length
'size' bits
BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize'
bytes defined as an array of variables
each of 'size' bits (bsize must be a
multiple of size / 8); 'bsize' may be any
constant expression
UNIT_CAST(x,size) casts a variable to a type of
length 'size' bits
UPTR_CAST(x,size) casts a pointer to a pointer to a
variable of length 'size' bits
*/
#define UI_TYPE(size) uint_##size##t
#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x
/* arguments are parenthesized so that e.g. a 'bsize' of A + B divides as a whole */
#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[(bsize) / ((size) >> 3)]
#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x))
#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x))
/* Added by Soeren S. Thomsen (begin) */
/* Short aliases for the types above. They are macros, not typedefs, so */
/* u64 is only usable where uint_64t itself has been defined. */
#define u8 uint_8t
#define u32 uint_32t
#define u64 uint_64t
/* (end) */
#if defined(__cplusplus)
}
#endif
#endif

3119
algo/groestl/sse2/groestl.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,956 @@
/* groestl-intr-vperm.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3 instructions.
* Author: Günther A. Roland, Martin Schläffer
*
* Based on the vperm and aes_ni implementations of the hash function Groestl
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
* Institute of Applied Mathematics, Middle East Technical University, Turkey
*
* This code is placed in the public domain
*/
#include <tmmintrin.h>
#include "grsi.h"
/*define data alignment for different C compilers*/
/* DATA_ALIGN16(x) decorates the declarator x with a 16-byte alignment */
/* request: the GNU attribute form when __GNUC__ is defined, otherwise the */
/* __declspec(align(16)) form. */
#if defined(__GNUC__)
#define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
#else
#define DATA_ALIGN16(x) __declspec(align(16)) x
#endif
/* GLOBAL and IFUN are declaration prefixes. The DECLARE_GLOBAL and */
/* DECLARE_IFUN tests are commented out and replaced by '#if 1', so both */
/* macros always expand to nothing and the 'extern' alternatives are dead: */
/* every GLOBAL variable in this header is therefore a definition. */
/* NOTE(review): this presumably relies on the header being included by a */
/* single translation unit - confirm against the includers. */
//#if defined(DECLARE_GLOBAL)
#if 1
#define GLOBAL
#else
#define GLOBAL extern
#endif
//#if defined(DECLARE_IFUN)
#if 1
#define IFUN
#else
#define IFUN extern
#endif
/* global constants */
//GLOBAL __m128i grsiROUND_CONST_Lx;
//GLOBAL __m128i grsiROUND_CONST_L0[grsiROUNDS512];
//GLOBAL __m128i grsiROUND_CONST_L7[grsiROUNDS512];
/* Eight 16-byte byte-shuffle masks stored as 32-bit words, four words per
 * mask (one mask per source line). They are read as vectors through
 * grsiSUBSH_MASK[0..7] and passed to _mm_shuffle_epi8 by the round macros. */
DATA_ALIGN16(int32_t grsiSUBSH_MASK_short[8*4]) = {
0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,
0x04030201, 0x08070605, 0x0c0b0a09, 0x000f0e0d,
0x05040302, 0x09080706, 0x0d0c0b0a, 0x01000f0e,
0x06050403, 0x0a090807, 0x0e0d0c0b, 0x0201000f,
0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100,
0x08070605, 0x0c0b0a09, 0x000f0e0d, 0x04030201,
0x09080706, 0x0d0c0b0a, 0x01000f0e, 0x05040302,
0x0e0d0c0b, 0x0201000f, 0x06050403, 0x0a090807
};
/* Vector view of the table above. The explicit cast is required: int32_t*
 * and __m128i* are incompatible pointer types, and converting between them
 * implicitly is a constraint violation (rejected by recent GCC releases). */
GLOBAL __m128i *grsiSUBSH_MASK = (__m128i *)grsiSUBSH_MASK_short;
/* Byte-splat constants with a static initial value (every byte 0x0f, 0x1b
 * and 0xff respectively).
 * With GCC/Clang, __m128i is a vector of two 64-bit integers, so the brace
 * initializer must supply two 64-bit lanes. A list of four 32-bit values
 * only uses its first two entries, each widened to 64 bits, which leaves
 * the upper half of every lane zero (and draws an "excess elements"
 * warning).
 * grsiALL_0F is assigned again by grsiSET_SHARED_CONSTANTS() and grsiALL_FF
 * by grsiSET_CONSTANTS(); grsiALL_1B has no runtime assignment in the
 * visible part of this file, so its static value is the one that is used. */
#if defined(__GNUC__)
GLOBAL __m128i grsiALL_0F = {0x0f0f0f0f0f0f0f0fLL, 0x0f0f0f0f0f0f0f0fLL};
GLOBAL __m128i grsiALL_1B = {0x1b1b1b1b1b1b1b1bLL, 0x1b1b1b1b1b1b1b1bLL};
GLOBAL __m128i grsiALL_FF = {-1LL, -1LL};
#else
/* NOTE(review): assumes an MSVC-style __m128i union whose first member is a
 * 16-element signed byte array - confirm for other non-GNU compilers. */
GLOBAL __m128i grsiALL_0F = {0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
                             0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
GLOBAL __m128i grsiALL_1B = {0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b,
                             0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b, 0x1b};
GLOBAL __m128i grsiALL_FF = {-1, -1, -1, -1, -1, -1, -1, -1,
                             -1, -1, -1, -1, -1, -1, -1, -1};
#endif
/* vperm lookup tables, two 128-bit halves each. All except grsiVPERM_SBO are
 * assigned by grsiSET_SHARED_CONSTANTS(); no assignment to grsiVPERM_SBO is
 * visible in this part of the file. */
GLOBAL __m128i grsiVPERM_OPT[2];
GLOBAL __m128i grsiVPERM_INV[2];
GLOBAL __m128i grsiVPERM_SB1[2];
GLOBAL __m128i grsiVPERM_SB2[2];
GLOBAL __m128i grsiVPERM_SB4[2];
GLOBAL __m128i grsiVPERM_SBO[2];
/* state vars */
/* grsiTRANSP_MASK, grsiVPERM_IPT and grsiALL_15 are assigned by
 * grsiSET_SHARED_CONSTANTS(); the assignment to grsiALL_63 there is
 * commented out. */
GLOBAL __m128i grsiTRANSP_MASK;
GLOBAL __m128i grsiVPERM_IPT[2];
GLOBAL __m128i grsiALL_15;
GLOBAL __m128i grsiALL_63;
/* per-round constants, filled in by the loop in grsiSET_CONSTANTS() */
GLOBAL __m128i grsiROUND_CONST_P[grsiROUNDS1024];
GLOBAL __m128i grsiROUND_CONST_Q[grsiROUNDS1024];
/* grsitos(tok) stringizes its argument exactly as written; */
/* grsitostr(tok) macro-expands the argument first and then stringizes it. */
#define grsitos(tok) #tok
#define grsitostr(tok) grsitos(tok)
/* The following two assignments are disabled and are not part of the macro:
grsiALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
grsiALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
*/
/* grsiSET_SHARED_CONSTANTS
 * Assigns the shuffle mask used by grsiMatrix_Transpose, the 0x0f and 0x15
 * byte-splat constants and the IPT / OPT / INV / SB1 / SB2 / SB4 lookup
 * tables. Takes no arguments and writes only the grsi* globals named below.
 * */
#define grsiSET_SHARED_CONSTANTS(){\
/* shuffle mask */\
grsiTRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02,\
                                0x0d050901, 0x0c040800);\
/* splat constants: the same 32-bit word in all four lanes */\
grsiALL_0F = _mm_set1_epi32(0x0f0f0f0f);\
grsiALL_15 = _mm_set1_epi32(0x15151515);\
/* lookup tables, low half [0] then high half [1] */\
grsiVPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81,\
                                 0x4C01307D, 0x317C4D00);\
grsiVPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808,\
                                 0xC2B2E898, 0x5A2A7000);\
grsiVPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0,\
                                 0x01EDBD51, 0x50BCEC00);\
grsiVPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808,\
                                 0xFF9F4929, 0xD6B66000);\
grsiVPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809,\
                                 0x01040A06, 0x0F0B0780);\
grsiVPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02,\
                                 0x0E05060F, 0x0D080180);\
grsiVPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF,\
                                 0x3618D415, 0xFAE22300);\
grsiVPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544,\
                                 0xB19BE18F, 0xCB503E00);\
grsiVPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A,\
                                 0x69EB8840, 0x0AE12900);\
grsiVPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD,\
                                 0xE27A93C6, 0x0B712400);\
grsiVPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914,\
                                 0x3D50AED7, 0xC393EA00);\
grsiVPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F,\
                                 0xE1E937A0, 0x3FD64100);\
}/**/
/* grsiVPERM
 * Transform without loading the constants (c* must already be set)
 * transforms 2 rows to/from "vperm mode"
 * this function is derived from:
 * vperm and aes_ni implementations of hash function Grostl
 * by Cagdas CALIK
 * inputs:
 * a0, a1 = 2 rows
 * c0 = mask selecting the low nibble of every byte
 * c1, c2 = the two halves of the transformation table
 * t0-t3 = clobbers
 * outputs:
 * a0, a1 = 2 rows transformed with the table
 * c0-c2 are only read.
 * */
#define grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
/* high nibbles of each row, moved down into the low nibble position */\
t0 = _mm_srli_epi32(_mm_andnot_si128(c0, a0), 4);\
t1 = _mm_srli_epi32(_mm_andnot_si128(c0, a1), 4);\
/* low nibbles of each row */\
a0 = _mm_and_si128(a0, c0);\
a1 = _mm_and_si128(a1, c0);\
/* c2 is indexed by the low nibbles */\
t2 = _mm_shuffle_epi8(c2, a0);\
t3 = _mm_shuffle_epi8(c2, a1);\
/* c1 is indexed by the high nibbles; both lookups are combined by xor */\
a0 = _mm_xor_si128(_mm_shuffle_epi8(c1, t0), t2);\
a1 = _mm_xor_si128(_mm_shuffle_epi8(c1, t1), t3);\
}/**/
/* grsiVPERM
 * Load the constants used by grsiVPERM_Transform_No_Const
 * inputs:
 * table = address of two consecutive 128-bit table halves
 * outputs:
 * c0 = grsiALL_0F
 * c1 = first table half
 * c2 = second table half
 * 'table' is parenthesized before the cast so that an expression argument
 * (for example a pointer plus an offset) is converted as a whole.
 * */
#define grsiVPERM_Transform_Set_Const(table, c0, c1, c2){\
c0 = grsiALL_0F;\
c1 = ((__m128i*)(table))[0];\
c2 = ((__m128i*)(table))[1];\
}/**/
/* grsiVPERM
 * Transform
 * loads the constants for 'table' and transforms 2 rows to/from
 * "vperm mode"
 * this function is derived from:
 * vperm and aes_ni implementations of hash function Grostl
 * by Cagdas CALIK
 * inputs:
 * a0, a1 = 2 rows
 * table = transformation table to use
 * t*, c* = clobbers
 * outputs:
 * a0, a1 = 2 rows transformed with table
 * */
#define grsiVPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
/* nibble mask and the two table halves */\
c0 = grsiALL_0F;\
c1 = ((__m128i*) table )[0];\
c2 = ((__m128i*) table )[1];\
/* apply them to the row pair */\
grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* grsiVPERM
 * Transform State
 * loads the constants for 'table' once and transforms four rows with
 * them, two rows at a time
 * inputs:
 * a0-a3 = state
 * table = transformation table to use
 * t*, c* = clobbers
 * outputs:
 * a0-a3 = transformed state
 * */
#define grsiVPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
/* nibble mask and the two table halves */\
c0 = grsiALL_0F;\
c1 = ((__m128i*) table )[0];\
c2 = ((__m128i*) table )[1];\
/* rows 0-1, then rows 2-3 */\
grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
grsiVPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* grsiVPERM
 * Add Constant to State
 * "add" is a bytewise xor of the same 128-bit value into all eight rows
 * inputs:
 * a0-a7 = state
 * constant = constant to add (read once)
 * t0 = clobber (holds the constant afterwards)
 * outputs:
 * a0-a7 = state + constant
 * */
#define grsiVPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
t0 = constant;\
a0 = _mm_xor_si128(t0, a0); a1 = _mm_xor_si128(t0, a1);\
a2 = _mm_xor_si128(t0, a2); a3 = _mm_xor_si128(t0, a3);\
a4 = _mm_xor_si128(t0, a4); a5 = _mm_xor_si128(t0, a5);\
a6 = _mm_xor_si128(t0, a6); a7 = _mm_xor_si128(t0, a7);\
}/**/
/* grsiVPERM
 * Set Substitute Core Constants
 * outputs:
 * c0 = grsiALL_0F
 * c1 = grsiVPERM_INV[0]
 * c2 = grsiVPERM_INV[1]
 * */
#define grsiVPERM_Substitute_Core_Set_Const(c0, c1, c2){\
c0 = grsiALL_0F;\
c1 = grsiVPERM_INV[0];\
c2 = grsiVPERM_INV[1];\
}/**/
/* grsiVPERM
 * Substitute Core
 * first part of sbox inverse computation
 * this function is derived from:
 * vperm and aes_ni implementations of hash function Grostl
 * by Cagdas CALIK
 * inputs:
 * a0 = 1 row (overwritten)
 * c0 = low-nibble mask, c1 / c2 = the two lookup tables (only read)
 * t0, t1 = clobbers
 * outputs:
 * b0a, b0b = inputs for lookup step
 * */
#define grsiVPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
/* t0 = high nibbles shifted down, a0 = low nibbles */\
t0 = _mm_srli_epi32(_mm_andnot_si128(c0, a0), 4);\
a0 = _mm_and_si128(a0, c0);\
/* c1 indexed by the low nibbles */\
b0a = _mm_shuffle_epi8(c1, a0);\
/* a0 = low nibbles xor high nibbles */\
a0 = _mm_xor_si128(a0, t0);\
/* two c2 lookups, each combined with the c1 lookup */\
b0b = _mm_xor_si128(_mm_shuffle_epi8(c2, t0), b0a);\
t1 = _mm_xor_si128(_mm_shuffle_epi8(c2, a0), b0a);\
/* second level of c2 lookups produces the two outputs */\
b0a = _mm_xor_si128(_mm_shuffle_epi8(c2, b0b), a0);\
b0b = _mm_xor_si128(_mm_shuffle_epi8(c2, t1), t0);\
}/**/
/* grsiVPERM
 * Lookup
 * second part of sbox inverse computation
 * this function is derived from:
 * vperm and aes_ni implementations of hash function Grostl
 * by Cagdas CALIK
 * inputs:
 * a0a, a0b = output of Substitution Core (only read)
 * table = lookup table to use (*1 / *2 / *4): address of two
 * consecutive 128-bit halves
 * t0 = clobber
 * outputs:
 * b0 = output of sbox + multiplication
 * 'table' is parenthesized before the cast so that an expression argument
 * is converted as a whole.
 * */
#define grsiVPERM_Lookup(a0a, a0b, table, b0, t0){\
b0 = ((__m128i*)(table))[0];\
t0 = ((__m128i*)(table))[1];\
/* first half is indexed by a0b, second half by a0a */\
b0 = _mm_shuffle_epi8(b0, a0b);\
t0 = _mm_shuffle_epi8(t0, a0a);\
b0 = _mm_xor_si128(b0, t0);\
}/**/
/* grsiVPERM
 * SubBytes and *2 / *4
 * this function is derived from:
 * Constant-time SSSE3 AES core implementation
 * by Mike Hamburg
 * and
 * vperm and aes_ni implementations of hash function Grostl
 * by Cagdas CALIK
 * inputs:
 * a0-a7 = state
 * t*, c* = clobbers
 * outputs:
 * a0-a7 = state * 4
 * c2 = row0 * 2 -> b0
 * c1 = row7 * 2 -> b3
 * c0 = row0 * 1 -> b4
 * t2 = row4 * 1 -> b7
 * TEMP_MUL1 = row(i) * 1 (stored for rows 1, 2, 3, 5, 6, 7 only)
 * TEMP_MUL2 = row(i) * 2 (stored for rows 0 - 6 only)
 *
 * TEMP_MUL1 and TEMP_MUL2 are not parameters: they must be arrays visible
 * at the point of expansion.
 * Rows are processed in the order 1, 2, 3, 5, 6, 7, 4, 0 so that the values
 * that are returned in c1, t2, c0 and c2 are produced last and are not
 * overwritten. From row 7 onwards c1 holds an output, which is why rows 4
 * and 0 read grsiVPERM_INV[0] directly instead of c1.
 *
 * call:grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
#define grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
/* set Constants */\
grsiVPERM_Substitute_Core_Set_Const(c0, c1, c2);\
/* row 1 */\
grsiVPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[1] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[1] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a1, t4);\
/* --- */\
/* row 2 */\
grsiVPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[2] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[2] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a2, t4);\
/* --- */\
/* row 3 */\
grsiVPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[3] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[3] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a3, t4);\
/* --- */\
/* row 5 */\
grsiVPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[5] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[5] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a5, t4);\
/* --- */\
/* row 6 */\
grsiVPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[6] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[6] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a6, t4);\
/* --- */\
/* row 7 */\
grsiVPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\
TEMP_MUL1[7] = t2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, c1, t4); /*c1 -> b3*/\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a7, t4);\
/* --- */\
/* row 4 */\
grsiVPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (grsiVPERM_INV[0]), c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4); /*t2 -> b7*/\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\
TEMP_MUL2[4] = t3;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a4, t4);\
/* --- */\
/* row 0 */\
grsiVPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (grsiVPERM_INV[0]), c2);\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, c0, t4); /*c0 -> b4*/\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, c2, t4); /*c2 -> b0*/\
TEMP_MUL2[0] = c2;\
grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a0, t4);\
/* --- */\
}/**/
/* Optimized grsiMixBytes
* inputs:
* a0-a7 = (row0-row7) * 4
* b0 = row0 * 2
* b3 = row7 * 2
* b4 = row7 * 1
* b7 = row4 * 1
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
* output: b0-b7
* */
#define grsiMixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* save one value */\
TEMP_MUL4 = a3;\
/* 1 */\
b1 = a0;\
b1 = _mm_xor_si128(b1, a5);\
b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
b2 = b1;\
\
/* 2 */\
b5 = a1;\
b5 = _mm_xor_si128(b5, a4);\
b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
b6 = b5;\
\
/* 4 */\
b7 = _mm_xor_si128(b7, a6);\
/*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
b2 = _mm_xor_si128(b2, b7);\
\
/* 3 */\
b0 = _mm_xor_si128(b0, a7);\
b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
/*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
b3 = b0;\
b1 = _mm_xor_si128(b1, b0);\
b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
\
/* 5 */\
b4 = _mm_xor_si128(b4, a2);\
/*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
b3 = _mm_xor_si128(b3, b4);\
b6 = _mm_xor_si128(b6, b4);\
\
/* 6 */\
a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
b4 = _mm_xor_si128(b4, a3);\
b5 = _mm_xor_si128(b5, a3);\
b7 = _mm_xor_si128(b7, a3);\
\
/* 7 */\
a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
b2 = _mm_xor_si128(b2, a1);\
b3 = _mm_xor_si128(b3, a1);\
\
/* 8 */\
a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
b6 = _mm_xor_si128(b6, a5);\
b7 = _mm_xor_si128(b7, a5);\
\
/* 9 */\
a3 = TEMP_MUL1[2];\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
b0 = _mm_xor_si128(b0, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* 10 */\
a1 = TEMP_MUL1[6];\
a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
b1 = _mm_xor_si128(b1, a1);\
b4 = _mm_xor_si128(b4, a1);\
\
/* 11 */\
a5 = TEMP_MUL1[3];\
a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
b1 = _mm_xor_si128(b1, a5);\
b6 = _mm_xor_si128(b6, a5);\
\
/* 12 */\
a3 = TEMP_MUL1[7];\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
b2 = _mm_xor_si128(b2, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* 13 */\
b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
b0 = _mm_xor_si128(b0, a4);\
b1 = _mm_xor_si128(b1, a4);\
b3 = _mm_xor_si128(b3, a6);\
b4 = _mm_xor_si128(b4, a0);\
b4 = _mm_xor_si128(b4, a7);\
b5 = _mm_xor_si128(b5, a0);\
b7 = _mm_xor_si128(b7, a2);\
}/**/
/* Disabled runtime initialisation of the grsiSUBSH_MASK entries, kept for
 * reference (not part of any macro):
grsiSUBSH_MASK[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);\
grsiSUBSH_MASK[1] = _mm_set_epi32(0x000f0e0d, 0x0c0b0a09, 0x08070605, 0x04030201);\
grsiSUBSH_MASK[2] = _mm_set_epi32(0x01000f0e, 0x0d0c0b0a, 0x09080706, 0x05040302);\
grsiSUBSH_MASK[3] = _mm_set_epi32(0x0201000f, 0x0e0d0c0b, 0x0a090807, 0x06050403);\
grsiSUBSH_MASK[4] = _mm_set_epi32(0x03020100, 0x0f0e0d0c, 0x0b0a0908, 0x07060504);\
grsiSUBSH_MASK[5] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605);\
grsiSUBSH_MASK[6] = _mm_set_epi32(0x05040302, 0x01000f0e, 0x0d0c0b0a, 0x09080706);\
grsiSUBSH_MASK[7] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b);\
*/
/* grsiSET_CONSTANTS
 * Runs grsiSET_SHARED_CONSTANTS, sets grsiALL_FF to all-ones and fills
 * grsiROUND_CONST_P[] / grsiROUND_CONST_Q[] for rounds 0..grsiROUNDS1024-1.
 * Each round constant is a fixed 128-bit pattern with the round number
 * xored into every byte (i * 0x01010101 replicates i into the four bytes of
 * a 32-bit word, for i below 256).
 * The loop counter 'i' is NOT declared here: an integer variable named i
 * must be in scope where the macro is expanded, and it is left equal to
 * grsiROUNDS1024 afterwards.
 * */
#define grsiSET_CONSTANTS(){\
grsiSET_SHARED_CONSTANTS();\
grsiALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
for(i = 0; i < grsiROUNDS1024; i++)\
{\
grsiROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
grsiROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
}\
}/**/
/* one round: SubBytes (with the *1 / *2 / *4 multiples) followed by MixBytes
 * in0-in7 = input rows (overwritten)
 * out0-out7 = output rows
 * The out* registers double as scratch for the first step: four of them are
 * its temporaries and the other four (out0, out3, out4, out7) carry its
 * extra results into grsiMixBytes.
 */
#define grsiSUBMIX(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, out6, out7){\
/* SubBytes + Multiplication */\
grsiVPERM_SUB_MULTIPLY(in0, in1, in2, in3, in4, in5, in6, in7, out1, out2, out5, out6, out0, out3, out4, out7);\
/* grsiMixBytes */\
grsiMixBytes(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, out6, out7);\
}/**/
/* grsiROUNDS_P
 * Runs 14 rounds of the P permutation, two per loop iteration.
 * The state is expected in xmm8-xmm15 and is returned in xmm8-xmm15;
 * xmm0-xmm7 receive the result of the first round of each pair and are
 * clobbered. xmm0-xmm15 (and the TEMP_MUL* arrays used by grsiSUBMIX) are
 * not parameters and must be visible where the macro is expanded.
 * Per round: xor the round constant into the first row only, apply one
 * grsiSUBSH_MASK shuffle per row, run grsiSUBMIX, then xor grsiALL_15 into
 * all eight output rows.
 * NOTE(review): the bound 14 is hard-coded while grsiROUND_CONST_P has
 * grsiROUNDS1024 entries - presumably grsiROUNDS1024 == 14; confirm.
 * */
#define grsiROUNDS_P(){\
u32 round_counter;\
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant P1024 */\
xmm8 = _mm_xor_si128(xmm8, (grsiROUND_CONST_P[round_counter]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8(xmm8, (grsiSUBSH_MASK[0]));\
xmm9 = _mm_shuffle_epi8(xmm9, (grsiSUBSH_MASK[1]));\
xmm10 = _mm_shuffle_epi8(xmm10, (grsiSUBSH_MASK[2]));\
xmm11 = _mm_shuffle_epi8(xmm11, (grsiSUBSH_MASK[3]));\
xmm12 = _mm_shuffle_epi8(xmm12, (grsiSUBSH_MASK[4]));\
xmm13 = _mm_shuffle_epi8(xmm13, (grsiSUBSH_MASK[5]));\
xmm14 = _mm_shuffle_epi8(xmm14, (grsiSUBSH_MASK[6]));\
xmm15 = _mm_shuffle_epi8(xmm15, (grsiSUBSH_MASK[7]));\
/* SubBytes + grsiMixBytes */\
grsiSUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
grsiVPERM_Add_Constant(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, grsiALL_15, xmm8);\
\
/* AddRoundConstant P1024 */\
xmm0 = _mm_xor_si128(xmm0, (grsiROUND_CONST_P[round_counter+1]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm_shuffle_epi8(xmm0, (grsiSUBSH_MASK[0]));\
xmm1 = _mm_shuffle_epi8(xmm1, (grsiSUBSH_MASK[1]));\
xmm2 = _mm_shuffle_epi8(xmm2, (grsiSUBSH_MASK[2]));\
xmm3 = _mm_shuffle_epi8(xmm3, (grsiSUBSH_MASK[3]));\
xmm4 = _mm_shuffle_epi8(xmm4, (grsiSUBSH_MASK[4]));\
xmm5 = _mm_shuffle_epi8(xmm5, (grsiSUBSH_MASK[5]));\
xmm6 = _mm_shuffle_epi8(xmm6, (grsiSUBSH_MASK[6]));\
xmm7 = _mm_shuffle_epi8(xmm7, (grsiSUBSH_MASK[7]));\
/* SubBytes + grsiMixBytes */\
grsiSUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm0);\
}\
}/**/
#define grsiROUNDS_Q(){\
grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm1);\
u32 round_counter = 0;\
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant Q1024 */\
xmm1 = grsiALL_FF;\
xmm8 = _mm_xor_si128(xmm8, xmm1);\
xmm9 = _mm_xor_si128(xmm9, xmm1);\
xmm10 = _mm_xor_si128(xmm10, xmm1);\
xmm11 = _mm_xor_si128(xmm11, xmm1);\
xmm12 = _mm_xor_si128(xmm12, xmm1);\
xmm13 = _mm_xor_si128(xmm13, xmm1);\
xmm14 = _mm_xor_si128(xmm14, xmm1);\
xmm15 = _mm_xor_si128(xmm15, (grsiROUND_CONST_Q[round_counter]));\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8(xmm8, (grsiSUBSH_MASK[1]));\
xmm9 = _mm_shuffle_epi8(xmm9, (grsiSUBSH_MASK[3]));\
xmm10 = _mm_shuffle_epi8(xmm10, (grsiSUBSH_MASK[5]));\
xmm11 = _mm_shuffle_epi8(xmm11, (grsiSUBSH_MASK[7]));\
xmm12 = _mm_shuffle_epi8(xmm12, (grsiSUBSH_MASK[0]));\
xmm13 = _mm_shuffle_epi8(xmm13, (grsiSUBSH_MASK[2]));\
xmm14 = _mm_shuffle_epi8(xmm14, (grsiSUBSH_MASK[4]));\
xmm15 = _mm_shuffle_epi8(xmm15, (grsiSUBSH_MASK[6]));\
/* SubBytes + grsiMixBytes */\
grsiSUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant Q1024 */\
xmm9 = grsiALL_FF;\
xmm0 = _mm_xor_si128(xmm0, xmm9);\
xmm1 = _mm_xor_si128(xmm1, xmm9);\
xmm2 = _mm_xor_si128(xmm2, xmm9);\
xmm3 = _mm_xor_si128(xmm3, xmm9);\
xmm4 = _mm_xor_si128(xmm4, xmm9);\
xmm5 = _mm_xor_si128(xmm5, xmm9);\
xmm6 = _mm_xor_si128(xmm6, xmm9);\
xmm7 = _mm_xor_si128(xmm7, (grsiROUND_CONST_Q[round_counter+1]));\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm_shuffle_epi8(xmm0, (grsiSUBSH_MASK[1]));\
xmm1 = _mm_shuffle_epi8(xmm1, (grsiSUBSH_MASK[3]));\
xmm2 = _mm_shuffle_epi8(xmm2, (grsiSUBSH_MASK[5]));\
xmm3 = _mm_shuffle_epi8(xmm3, (grsiSUBSH_MASK[7]));\
xmm4 = _mm_shuffle_epi8(xmm4, (grsiSUBSH_MASK[0]));\
xmm5 = _mm_shuffle_epi8(xmm5, (grsiSUBSH_MASK[2]));\
xmm6 = _mm_shuffle_epi8(xmm6, (grsiSUBSH_MASK[4]));\
xmm7 = _mm_shuffle_epi8(xmm7, (grsiSUBSH_MASK[6]));\
/* SubBytes + grsiMixBytes*/ \
grsiSUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm1);\
}/**/
/* Matrix Transpose
* input is a 1024-bit state with two columns in one xmm
* output is a 1024-bit state with two rows in one xmm
* inputs: i0-i7
* outputs: i0-i7
* clobbers: t0-t7
*/
#define grsiMatrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
t0 = grsiTRANSP_MASK;\
\
i6 = _mm_shuffle_epi8(i6, t0);\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
t1 = i2;\
i4 = _mm_shuffle_epi8(i4, t0);\
i5 = _mm_shuffle_epi8(i5, t0);\
t2 = i4;\
t3 = i6;\
i7 = _mm_shuffle_epi8(i7, t0);\
\
/* continue with unpack using 4 temp registers */\
t0 = i0;\
t2 = _mm_unpackhi_epi16(t2, i5);\
i4 = _mm_unpacklo_epi16(i4, i5);\
t3 = _mm_unpackhi_epi16(t3, i7);\
i6 = _mm_unpacklo_epi16(i6, i7);\
t0 = _mm_unpackhi_epi16(t0, i1);\
t1 = _mm_unpackhi_epi16(t1, i3);\
i2 = _mm_unpacklo_epi16(i2, i3);\
i0 = _mm_unpacklo_epi16(i0, i1);\
\
/* shuffle with immediate */\
t0 = _mm_shuffle_epi32(t0, 216);\
t1 = _mm_shuffle_epi32(t1, 216);\
t2 = _mm_shuffle_epi32(t2, 216);\
t3 = _mm_shuffle_epi32(t3, 216);\
i0 = _mm_shuffle_epi32(i0, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
i4 = _mm_shuffle_epi32(i4, 216);\
i6 = _mm_shuffle_epi32(i6, 216);\
\
/* continue with unpack */\
t4 = i0;\
i0 = _mm_unpacklo_epi32(i0, i2);\
t4 = _mm_unpackhi_epi32(t4, i2);\
t5 = t0;\
t0 = _mm_unpacklo_epi32(t0, t1);\
t5 = _mm_unpackhi_epi32(t5, t1);\
t6 = i4;\
i4 = _mm_unpacklo_epi32(i4, i6);\
t7 = t2;\
t6 = _mm_unpackhi_epi32(t6, i6);\
i2 = t0;\
t2 = _mm_unpacklo_epi32(t2, t3);\
i3 = t0;\
t7 = _mm_unpackhi_epi32(t7, t3);\
\
/* there are now 2 rows in each xmm */\
/* unpack to get 1 row of CV in each xmm */\
i1 = i0;\
i1 = _mm_unpackhi_epi64(i1, i4);\
i0 = _mm_unpacklo_epi64(i0, i4);\
i4 = t4;\
i3 = _mm_unpackhi_epi64(i3, t2);\
i5 = t4;\
i2 = _mm_unpacklo_epi64(i2, t2);\
i6 = t5;\
i5 = _mm_unpackhi_epi64(i5, t6);\
i7 = t5;\
i4 = _mm_unpacklo_epi64(i4, t6);\
i7 = _mm_unpackhi_epi64(i7, t7);\
i6 = _mm_unpacklo_epi64(i6, t7);\
/* transpose done */\
}/**/
/* Matrix Transpose Inverse
* input is a 1024-bit state with two rows in one xmm
* output is a 1024-bit state with two columns in one xmm
* inputs: i0-i7
* outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
* clobbers: t0-t4
*/
#define grsiMatrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
/* transpose matrix to get output format */\
o1 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o1 = _mm_unpackhi_epi64(o1, i1);\
t0 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
t0 = _mm_unpackhi_epi64(t0, i3);\
t1 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
t1 = _mm_unpackhi_epi64(t1, i5);\
t2 = i6;\
o0 = grsiTRANSP_MASK;\
i6 = _mm_unpacklo_epi64(i6, i7);\
t2 = _mm_unpackhi_epi64(t2, i7);\
/* load transpose mask into a register, because it will be used 8 times */\
i0 = _mm_shuffle_epi8(i0, o0);\
i2 = _mm_shuffle_epi8(i2, o0);\
i4 = _mm_shuffle_epi8(i4, o0);\
i6 = _mm_shuffle_epi8(i6, o0);\
o1 = _mm_shuffle_epi8(o1, o0);\
t0 = _mm_shuffle_epi8(t0, o0);\
t1 = _mm_shuffle_epi8(t1, o0);\
t2 = _mm_shuffle_epi8(t2, o0);\
/* continue with unpack using 4 temp registers */\
t3 = i4;\
o2 = o1;\
o0 = i0;\
t4 = t1;\
\
t3 = _mm_unpackhi_epi16(t3, i6);\
i4 = _mm_unpacklo_epi16(i4, i6);\
o0 = _mm_unpackhi_epi16(o0, i2);\
i0 = _mm_unpacklo_epi16(i0, i2);\
o2 = _mm_unpackhi_epi16(o2, t0);\
o1 = _mm_unpacklo_epi16(o1, t0);\
t4 = _mm_unpackhi_epi16(t4, t2);\
t1 = _mm_unpacklo_epi16(t1, t2);\
/* shuffle with immediate */\
i4 = _mm_shuffle_epi32(i4, 216);\
t3 = _mm_shuffle_epi32(t3, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
o2 = _mm_shuffle_epi32(o2, 216);\
i0 = _mm_shuffle_epi32(i0, 216);\
o0 = _mm_shuffle_epi32(o0, 216);\
t1 = _mm_shuffle_epi32(t1, 216);\
t4 = _mm_shuffle_epi32(t4, 216);\
/* continue with unpack */\
i1 = i0;\
i3 = o0;\
i5 = o1;\
i7 = o2;\
i0 = _mm_unpacklo_epi32(i0, i4);\
i1 = _mm_unpackhi_epi32(i1, i4);\
o0 = _mm_unpacklo_epi32(o0, t3);\
i3 = _mm_unpackhi_epi32(i3, t3);\
o1 = _mm_unpacklo_epi32(o1, t1);\
i5 = _mm_unpackhi_epi32(i5, t1);\
o2 = _mm_unpacklo_epi32(o2, t4);\
i7 = _mm_unpackhi_epi32(i7, t4);\
/* transpose done */\
}/**/
/* transform round constants into grsiVPERM mode */
#define grsiVPERM_Transform_RoundConst_CNT2(i, j){\
xmm0 = grsiROUND_CONST_P[i];\
xmm1 = grsiROUND_CONST_P[j];\
xmm2 = grsiROUND_CONST_Q[i];\
xmm3 = grsiROUND_CONST_Q[j];\
grsiVPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, grsiVPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
xmm2 = _mm_xor_si128(xmm2, (grsiALL_15));\
xmm3 = _mm_xor_si128(xmm3, (grsiALL_15));\
grsiROUND_CONST_P[i] = xmm0;\
grsiROUND_CONST_P[j] = xmm1;\
grsiROUND_CONST_Q[i] = xmm2;\
grsiROUND_CONST_Q[j] = xmm3;\
}/**/
/* transform round constants into grsiVPERM mode
 * Converts round-constant table entries 0-13 two at a time, then converts
 * grsiALL_FF (grsiVPERM_Transform with grsiVPERM_IPT followed by an xor
 * with grsiALL_15). The tables and grsiALL_FF are overwritten in place.
 * NOTE(review): a second invocation would operate on already converted
 * values -- confirm the constants are re-initialised before each use.
 */
#define grsiVPERM_Transform_RoundConst(){\
grsiVPERM_Transform_RoundConst_CNT2(0, 1);\
grsiVPERM_Transform_RoundConst_CNT2(2, 3);\
grsiVPERM_Transform_RoundConst_CNT2(4, 5);\
grsiVPERM_Transform_RoundConst_CNT2(6, 7);\
grsiVPERM_Transform_RoundConst_CNT2(8, 9);\
grsiVPERM_Transform_RoundConst_CNT2(10, 11);\
grsiVPERM_Transform_RoundConst_CNT2(12, 13);\
xmm0 = grsiALL_FF;\
grsiVPERM_Transform(xmm0, xmm1, grsiVPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
xmm0 = _mm_xor_si128(xmm0, (grsiALL_15));\
grsiALL_FF = xmm0;\
}/**/
/* Convert the initial chaining value into the internal representation.
 * h is accessed through an __m128i pointer as chaining[0..7] and is
 * rewritten in place: the round constants are first put into grsiVPERM
 * form, then the IV is transformed with grsiVPERM_IPT and transposed.
 * NOTE(review): the __m128i accesses presumably require h to be 16-byte
 * aligned -- confirm against callers.
 * Only a prototype is emitted unless DECLARE_IFUN is defined.
 * The xmm work variables have static storage, so all calls share them.
 */
IFUN void grsiINIT(u64* h)
#if !defined(DECLARE_IFUN)
;
#else
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
/* transform round constants into grsiVPERM mode */
grsiVPERM_Transform_RoundConst();
/* load IV into registers xmm8 - xmm15 */
xmm8 = chaining[0];
xmm9 = chaining[1];
xmm10 = chaining[2];
xmm11 = chaining[3];
xmm12 = chaining[4];
xmm13 = chaining[5];
xmm14 = chaining[6];
xmm15 = chaining[7];
/* transform chaining value from column ordering into row ordering */
/* the state is converted four registers at a time, then transposed as a whole */
grsiVPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
grsiVPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
grsiMatrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
/* store transposed IV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
}
#endif
/* Compression function for one message block.
 * h: chaining value, read and updated in place as chaining[0..7]
 * m: message block, read as message[0..7]
 * Computes h <- P(h+M) + h + Q(M), where M is the message after the
 * grsiVPERM_IPT transform and transpose.
 * NOTE(review): both pointers are dereferenced as __m128i, which presumably
 * requires 16-byte alignment -- confirm against callers.
 * Only a prototype is emitted unless DECLARE_IFUN is defined.
 * All work variables have static storage, so all calls share them.
 */
IFUN void grsiTF1024(u64* h, u64* m)
#if !defined(DECLARE_IFUN)
;
#else
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
/* TEMP_MUL1/TEMP_MUL2/TEMP_MUL4 are not named again in this function;
   presumably they are scratch space for grsiROUNDS_P/grsiROUNDS_Q -- confirm */
static __m128i TEMP_MUL1[8];
static __m128i TEMP_MUL2[8];
static __m128i TEMP_MUL4;
static __m128i QTEMP[8];
/* load message into registers xmm8 - xmm15 (Q = message) */
xmm8 = message[0];
xmm9 = message[1];
xmm10 = message[2];
xmm11 = message[3];
xmm12 = message[4];
xmm13 = message[5];
xmm14 = message[6];
xmm15 = message[7];
/* transform message M from column ordering into row ordering */
grsiVPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
grsiVPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
grsiMatrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
/* store message M (Q input) for later */
QTEMP[0] = xmm8;
QTEMP[1] = xmm9;
QTEMP[2] = xmm10;
QTEMP[3] = xmm11;
QTEMP[4] = xmm12;
QTEMP[5] = xmm13;
QTEMP[6] = xmm14;
QTEMP[7] = xmm15;
/* xor CV to message to get P input */
/* result: CV+M in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* compute permutation P */
/* result: P(CV+M) in xmm8...xmm15 */
grsiROUNDS_P();
/* xor CV to P output (feed-forward) */
/* result: P(CV+M)+CV in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* store P(CV+M)+CV */
/* the chaining array doubles as the accumulator so xmm8-15 are free for Q */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
/* load message M (Q input) into xmm8-15 */
xmm8 = QTEMP[0];
xmm9 = QTEMP[1];
xmm10 = QTEMP[2];
xmm11 = QTEMP[3];
xmm12 = QTEMP[4];
xmm13 = QTEMP[5];
xmm14 = QTEMP[6];
xmm15 = QTEMP[7];
/* compute permutation Q */
/* result: Q(M) in xmm8...xmm15 */
grsiROUNDS_Q();
/* xor Q output */
/* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* store CV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
return;
}
#endif
/* Output transformation.
 * h: chaining value, read as chaining[0..7]; only chaining[4..7] are
 *    written, with the truncated half of P(CV)+CV after it has been
 *    transposed back and passed through grsiVPERM_OPT.
 * NOTE(review): h is dereferenced as __m128i, which presumably requires
 * 16-byte alignment -- confirm against callers.
 * Only a prototype is emitted unless DECLARE_IFUN is defined.
 * All work variables have static storage, so all calls share them.
 */
IFUN void grsiOF1024(u64* h)
#if !defined(DECLARE_IFUN)
;
#else
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
/* TEMP_MUL1/TEMP_MUL2/TEMP_MUL4 are not named again in this function;
   presumably they are scratch space for grsiROUNDS_P -- confirm */
static __m128i TEMP_MUL1[8];
static __m128i TEMP_MUL2[8];
static __m128i TEMP_MUL4;
/* load CV into registers xmm8 - xmm15 */
xmm8 = chaining[0];
xmm9 = chaining[1];
xmm10 = chaining[2];
xmm11 = chaining[3];
xmm12 = chaining[4];
xmm13 = chaining[5];
xmm14 = chaining[6];
xmm15 = chaining[7];
/* compute permutation P */
/* result: P(CV) in xmm8...xmm15 */
grsiROUNDS_P();
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* transpose CV back from row ordering to column ordering */
/* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
/* (these are the last four entries of the inverse transpose's output list: o1, o2, i5, i7) */
grsiMatrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
grsiVPERM_Transform_State(xmm0, xmm6, xmm13, xmm15, grsiVPERM_OPT, xmm1, xmm2, xmm3, xmm5, xmm7, xmm10, xmm12);
/* we only need to return the truncated half of the state */
chaining[4] = xmm0;
chaining[5] = xmm6;
chaining[6] = xmm13;
chaining[7] = xmm15;
return;
}
#endif

273
algo/groestl/sse2/grsi.c Normal file
View File

@@ -0,0 +1,273 @@
/* hash.c Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#include "grsi.h"
#include "grsi-asm.h"
/* void grsiInit(grsiState* ctx) { */
/* Initialise the Groestl context `sts_grs`, which must be a grsiState in
 * scope where the macro is expanded: sets the variant parameters, loads
 * the constants, clears chaining value and buffer, installs the IV and
 * resets the bookkeeping fields.
 * The former NULL test on grsichaining/grsibuffer was dropped: both are
 * array members of grsiState, so the test could never be true, it ran only
 * after the arrays had already been written, and it hid a `return;`
 * inside the macro.
 */
#define GRS_I \
do { \
  grsiState *ctx = &sts_grs; \
  /* not referenced by name below; NOTE(review): presumably used by \
     grsiSET_CONSTANTS() -- confirm before removing */ \
  u8 i = 0; \
 \
  /* set number of state columns and state size depending on \
     variant */ \
  ctx->grsicolumns = grsiCOLS; \
  ctx->grsistatesize = grsiSIZE; \
  ctx->grsiv = LONG; \
 \
  grsiSET_CONSTANTS(); \
 \
  /* clear chaining value and data buffer */ \
  memset(ctx->grsichaining, 0, sizeof(u64)*grsiSIZE/8); \
  memset(ctx->grsibuffer, 0, sizeof(grsiBitSequence)*grsiSIZE); \
 \
  /* set initial value */ \
  ctx->grsichaining[ctx->grsicolumns-1] = grsiU64BIG((u64)grsiLENGTH); \
 \
  grsiINIT(ctx->grsichaining); \
 \
  /* set other variables */ \
  ctx->grsibuf_ptr = 0; \
  ctx->grsiblock_counter = 0; \
  ctx->grsibits_in_last_byte = 0; \
 \
} while (0)
/* Digest every complete grsiSIZE-byte block contained in the first len
 * bytes of `in`; a trailing partial block is ignored. The block counter is
 * advanced by the number of complete blocks before they are processed. */
void grsiTransform(grsiState *ctx,
		  const u8 *in,
		  unsigned long long len) {
  const u8 *block = in;
  unsigned long long left = len;

  /* account for all complete blocks up front */
  ctx->grsiblock_counter += len/grsiSIZE;

  /* compress one block per iteration */
  while (left >= grsiSIZE) {
    grsiTF1024((u64*)ctx->grsichaining, (u64*)block);
    left -= grsiSIZE;
    block += grsiSIZE;
  }

  asm volatile ("emms");
}
/* given state h, do h <- P(h)+h
 * Thin wrapper: hands the context's chaining value to grsiOF1024 and then
 * executes the emms instruction. */
void grsiOutputTransformation(grsiState *ctx) {
/* only the 1024-bit output function is called here; no variant selection takes place */
grsiOF1024((u64*)ctx->grsichaining);
asm volatile ("emms");
}
/* initialise context
 * Sets the variant parameters, loads the constants, clears chaining value
 * and data buffer, installs the IV and resets the bookkeeping fields.
 * Does nothing when ctx is NULL or when grsiLENGTH is not a valid digest
 * size. */
void grsiInit(grsiState* ctx) {
  /* NOTE(review): i is presumably also used by grsiSET_CONSTANTS();
     keep its name and type unless that macro is checked */
  u8 i = 0;

  /* a missing context has to be rejected before anything is written
     through it; the previous NULL test looked at the array members after
     they had been cleared and could never trigger */
  if (ctx == NULL)
    return;

  /* output size (in bits) must be a positive integer less than or
     equal to 512, and divisible by 8 */
  if (grsiLENGTH <= 0 || (grsiLENGTH%8) || grsiLENGTH > 512)
    return;

  /* set number of state columns and state size depending on
     variant */
  ctx->grsicolumns = grsiCOLS;
  ctx->grsistatesize = grsiSIZE;
  ctx->grsiv = LONG;

  grsiSET_CONSTANTS();

  /* clear chaining value and data buffer */
  for (i=0; i<grsiSIZE/8; i++)
    ctx->grsichaining[i] = 0;
  for (i=0; i<grsiSIZE; i++)
    ctx->grsibuffer[i] = 0;

  /* set initial value */
  ctx->grsichaining[ctx->grsicolumns-1] = grsiU64BIG((u64)grsiLENGTH);

  grsiINIT(ctx->grsichaining);

  /* set other variables */
  ctx->grsibuf_ptr = 0;
  ctx->grsiblock_counter = 0;
  ctx->grsibits_in_last_byte = 0;
  return;
}
/* Absorb databitlen bits of input into the hash state.
 * Whole blocks are compressed immediately, leftover bytes are kept in the
 * context buffer. A bit count that is not a multiple of 8 is accepted only
 * once; afterwards further calls return without doing anything. */
void grsiUpdate(grsiState* ctx,
		const grsiBitSequence* input,
		grsiDataLength databitlen) {
  const int total_bytes = (int)(databitlen/8);
  const int tail_bits = (int)(databitlen%8);
  int pos = 0;
  int bulk;

  /* a partial byte was already supplied: no more input is accepted */
  if (ctx->grsibits_in_last_byte != 0)
    return;

  if (ctx->grsibuf_ptr != 0) {
    /* top up the partially filled buffer first */
    for (; ctx->grsibuf_ptr < ctx->grsistatesize && pos < total_bytes; pos++)
      ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[pos];

    if (ctx->grsibuf_ptr < ctx->grsistatesize) {
      /* input exhausted before the buffer filled up */
      if (tail_bits != 0) {
	ctx->grsibits_in_last_byte = tail_bits;
	ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[pos];
      }
      return;
    }

    /* the buffer is full: compress it */
    ctx->grsibuf_ptr = 0;
    printf("error\n");
    grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize);
  }

  /* compress all complete blocks straight from the input */
  bulk = total_bytes - pos;
  grsiTransform(ctx, input+pos, bulk);
  pos += (bulk/ctx->grsistatesize)*ctx->grsistatesize;

  /* keep the remaining whole bytes for the next call */
  for (; pos < total_bytes; pos++)
    ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[pos];

  /* keep a trailing partial byte together with its bit count */
  if (tail_bits != 0) {
    ctx->grsibits_in_last_byte = tail_bits;
    ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[pos];
  }
  return;
}
/* Absorb exactly 64 bytes (512 bits) of input into the hash state.
 * Same procedure as grsiUpdate with a fixed length; because the length is
 * a whole number of bytes there is never a partial byte to record. */
void grsiUpdateq(grsiState* ctx, const grsiBitSequence* input)
{
  const int msglen = 64;	/* fixed input length in bytes */
  int pos = 0;
  int bulk;

  /* a partial byte was already supplied: no more input is accepted */
  if (ctx->grsibits_in_last_byte)
    return;

  if (ctx->grsibuf_ptr) {
    /* top up the partially filled buffer first */
    while (ctx->grsibuf_ptr < ctx->grsistatesize && pos < msglen) {
      ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[pos];
      pos++;
    }

    /* input exhausted before the buffer filled up */
    if (ctx->grsibuf_ptr < ctx->grsistatesize)
      return;

    /* the buffer is full: compress it */
    ctx->grsibuf_ptr = 0;
    printf("error\n");
    grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize);
  }

  /* compress all complete blocks straight from the input */
  bulk = msglen - pos;
  grsiTransform(ctx, input+pos, bulk);
  pos += (bulk/ctx->grsistatesize)*ctx->grsistatesize;

  /* keep the remaining bytes for the next call */
  while (pos < msglen) {
    ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[pos];
    pos++;
  }
  return;
}
#define BILB ctx->grsibits_in_last_byte
/* Finish the hash: append the padding and the block count, compress the
 * last block(s), run the output transformation, copy the last
 * grsiLENGTH/8 bytes of the chaining value to 'output', then clear the
 * chaining words and the data buffer. */
void grsiFinal(grsiState* ctx,
	       grsiBitSequence* output) {
  const int block_bytes = ctx->grsistatesize;
  const int pad_limit = block_bytes - grsiLENGTHFIELDLEN;
  const int digest_bytes = grsiLENGTH/8;
  const u8 *digest = (grsiBitSequence*)ctx->grsichaining;
  int k;

  /* pad with '1'-bit and first few '0'-bits */
  if (ctx->grsibits_in_last_byte) {
    const int used = ctx->grsibits_in_last_byte;
    grsiBitSequence *last = &ctx->grsibuffer[(int)ctx->grsibuf_ptr-1];

    *last &= ((1<<used)-1)<<(8-used);
    *last ^= 0x1<<(7-used);
    ctx->grsibits_in_last_byte = 0;
  } else {
    ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = 0x80;
  }

  /* no room left for the length field: zero-fill and compress this block */
  if (ctx->grsibuf_ptr > pad_limit) {
    while (ctx->grsibuf_ptr < block_bytes)
      ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = 0;
    grsiTransform(ctx, ctx->grsibuffer, block_bytes);
    ctx->grsibuf_ptr = 0;
  }

  /* zero-fill up to the start of the length field */
  while (ctx->grsibuf_ptr < pad_limit)
    ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = 0;

  /* length field: block count including this final block, written from
     the last byte of the block backwards */
  ctx->grsiblock_counter++;
  for (ctx->grsibuf_ptr = block_bytes;
       ctx->grsibuf_ptr > pad_limit;
       ctx->grsiblock_counter >>= 8)
    ctx->grsibuffer[(int)--ctx->grsibuf_ptr] = (u8)ctx->grsiblock_counter;

  /* digest final padding block */
  grsiTransform(ctx, ctx->grsibuffer, block_bytes);
  /* perform output transformation */
  grsiOutputTransformation(ctx);

  /* the digest is the tail of the chaining value */
  for (k = 0; k < digest_bytes; k++)
    output[k] = digest[block_bytes - digest_bytes + k];

  /* zeroise relevant variables */
  for (k = 0; k < ctx->grsicolumns; k++)
    ctx->grsichaining[k] = 0;
  for (k = 0; k < block_bytes; k++)
    ctx->grsibuffer[k] = 0;
  return;
}

79
algo/groestl/sse2/grsi.h Normal file
View File

@@ -0,0 +1,79 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#ifndef __grsi_h
#define __grsi_h
#include <stdio.h>
#include <stdlib.h>
#include "brg_endian.h"
#define NEED_UINT_64T
#include "brg_types.h"
#define grsiLENGTH 512
/* some sizes (number of bytes) */
#define grsiROWS 8
#define grsiLENGTHFIELDLEN grsiROWS
#define grsiCOLS512 8
#define grsiCOLS1024 16
#define grsiSIZE512 (grsiROWS*grsiCOLS512)
#define grsiSIZE1024 (grsiROWS*grsiCOLS1024)
#define grsiROUNDS512 10
#define grsiROUNDS1024 14
#if grsiLENGTH<=256
#define grsiCOLS grsiCOLS512
#define grsiSIZE grsiSIZE512
#define grsiROUNDS grsiROUNDS512
#else
#define grsiCOLS grsiCOLS1024
#define grsiSIZE grsiSIZE1024
#define grsiROUNDS grsiROUNDS1024
#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define grsiEXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define grsiU64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
/* byte n of var, counted from the least significant byte; n is
   parenthesised so that an expression argument such as i+1 is scaled as a
   whole instead of expanding to 8*i+1 */
#define grsiEXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(n))))
/* reverses the byte order of a 64-bit value using four rotate-and-mask
   terms (li_64 is provided by brg_types.h) */
#define grsiU64BIG(a) \
  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
   (ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
typedef enum { LONG, SHORT } grsiVar;
/* NIST API begin */
typedef unsigned char grsiBitSequence;
typedef unsigned long long grsiDataLength;
/* Hash context: chaining value, buffer for not yet processed input and
   the bookkeeping fields used by grsiInit/grsiUpdate/grsiFinal. */
typedef struct {
__attribute__ ((aligned (32))) u64 grsichaining[grsiSIZE/8]; /* actual state */
__attribute__ ((aligned (32))) grsiBitSequence grsibuffer[grsiSIZE]; /* data buffer */
u64 grsiblock_counter; /* message block counter */
int grsibuf_ptr; /* data buffer pointer (index of the next free byte in grsibuffer) */
int grsibits_in_last_byte; /* no. of message bits in last byte of
data buffer */
int grsicolumns; /* no. of columns in state */
int grsistatesize; /* total no. of bytes in state */
grsiVar grsiv; /* LONG or SHORT */
} grsiState;
void grsiInit(grsiState*);
void grsiUpdate(grsiState*, const grsiBitSequence*, grsiDataLength);
void grsiFinal(grsiState*, grsiBitSequence*);
/* NIST API end */
#endif /* __grsi_h */

1044
algo/groestl/sse2/grsn-asm.h Normal file

File diff suppressed because it is too large Load Diff

247
algo/groestl/sse2/grsn.c Normal file
View File

@@ -0,0 +1,247 @@
/* hash.c Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#include "grsn-asm.h"
/* Digest every complete grsnSIZE-byte block contained in the first len
 * bytes of `in`; a trailing partial block is ignored. The block counter is
 * advanced by the number of complete blocks before they are processed. */
void grsnTransform(grsnState *ctx,
		  const u8 *in,
		  unsigned long long len) {
  const u8 *block = in;
  unsigned long long left = len;

  /* account for all complete blocks up front */
  ctx->block_counter += len/grsnSIZE;

  /* compress one block per iteration with the variant chosen at compile time */
  while (left >= grsnSIZE) {
#if grsnLENGTH<=256
    TF512((u64*)ctx->chaining, (u64*)block);
#else
    TF1024((u64*)ctx->chaining, (u64*)block);
#endif
    left -= grsnSIZE;
    block += grsnSIZE;
  }

  asm volatile ("emms");
}
/* given state h, do h <- P(h)+h
 * Thin wrapper: hands the context's chaining value to OF512 or OF1024 and
 * then executes the emms instruction. */
void grsnOutputTransformation(grsnState *ctx) {
/* determine variant (chosen at compile time from grsnLENGTH) */
#if (grsnLENGTH <= 256)
OF512((u64*)ctx->chaining);
#else
OF1024((u64*)ctx->chaining);
#endif
asm volatile ("emms");
}
/* initialise context
 * Sets the variant parameters, loads the constants, clears chaining value
 * and data buffer, installs the IV and resets the bookkeeping fields.
 * Does nothing when ctx is NULL or when grsnLENGTH is not a valid digest
 * size. */
void grsnInit(grsnState* ctx) {
  /* NOTE(review): i is presumably also used by SET_CONSTANTS();
     keep its name and type unless that macro is checked */
  u8 i = 0;

  /* a missing context has to be rejected before anything is written
     through it; the previous NULL test looked at the array members after
     they had been cleared and could never trigger */
  if (ctx == NULL)
    return;

  /* output size (in bits) must be a positive integer less than or
     equal to 512, and divisible by 8 */
  if (grsnLENGTH <= 0 || (grsnLENGTH%8) || grsnLENGTH > 512)
    return;

  /* set number of state columns and state size depending on
     variant */
  ctx->columns = grsnCOLS;
  ctx->statesize = grsnSIZE;
#if (grsnLENGTH <= 256)
  ctx->v = SHORT;
#else
  ctx->v = LONG;
#endif

  SET_CONSTANTS();

  /* clear chaining value and data buffer */
  for (i=0; i<grsnSIZE/8; i++)
    ctx->chaining[i] = 0;
  for (i=0; i<grsnSIZE; i++)
    ctx->buffer[i] = 0;

  /* set initial value */
  ctx->chaining[ctx->columns-1] = U64BIG((u64)grsnLENGTH);

  INIT(ctx->chaining);

  /* set other variables */
  ctx->buf_ptr = 0;
  ctx->block_counter = 0;
  ctx->bits_in_last_byte = 0;
  return;
}
/* Absorb databitlen bits of input into the hash state.
 * Whole blocks are compressed immediately, leftover bytes are kept in the
 * context buffer. A bit count that is not a multiple of 8 is accepted only
 * once; afterwards further calls return without doing anything. */
void grsnUpdate(grsnState* ctx,
		const BitSequence* input,
		DataLength databitlen) {
  const int total_bytes = (int)(databitlen/8);
  const int tail_bits = (int)(databitlen%8);
  int pos = 0;
  int bulk;

  /* a partial byte was already supplied: no more input is accepted */
  if (ctx->bits_in_last_byte != 0)
    return;

  if (ctx->buf_ptr != 0) {
    /* top up the partially filled buffer first */
    for (; ctx->buf_ptr < ctx->statesize && pos < total_bytes; pos++)
      ctx->buffer[(int)ctx->buf_ptr++] = input[pos];

    if (ctx->buf_ptr < ctx->statesize) {
      /* input exhausted before the buffer filled up */
      if (tail_bits != 0) {
	ctx->bits_in_last_byte = tail_bits;
	ctx->buffer[(int)ctx->buf_ptr++] = input[pos];
      }
      return;
    }

    /* the buffer is full: compress it */
    ctx->buf_ptr = 0;
    printf("error\n");
    grsnTransform(ctx, ctx->buffer, ctx->statesize);
  }

  /* compress all complete blocks straight from the input */
  bulk = total_bytes - pos;
  grsnTransform(ctx, input+pos, bulk);
  pos += (bulk/ctx->statesize)*ctx->statesize;

  /* keep the remaining whole bytes for the next call */
  for (; pos < total_bytes; pos++)
    ctx->buffer[(int)ctx->buf_ptr++] = input[pos];

  /* keep a trailing partial byte together with its bit count */
  if (tail_bits != 0) {
    ctx->bits_in_last_byte = tail_bits;
    ctx->buffer[(int)ctx->buf_ptr++] = input[pos];
  }
  return;
}
/* Absorb exactly 64 bytes (512 bits) of input into the hash state.
 * Same procedure as grsnUpdate with a fixed length. Like grsnUpdate, the
 * call is ignored once a partial byte has been stored in the context;
 * this guard was missing here, so input could be appended after the final
 * partial byte. The fixed length is a whole number of bytes, so the
 * partial-byte branches could never run and have been removed. */
void grsnUpdateq(grsnState* ctx, const BitSequence* input)
{
  const int msglen = (int)((64*8)/8);	/* fixed input length in bytes */
  int index = 0;

  /* non-integral number of message bytes can only be supplied in the
     last call to an update function */
  if (ctx->bits_in_last_byte) return;

  /* if the buffer contains data that has not yet been digested, first
     add data to buffer until full */
  if (ctx->buf_ptr) {
    while (ctx->buf_ptr < ctx->statesize && index < msglen) {
      ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
    }
    if (ctx->buf_ptr < ctx->statesize) {
      /* buffer still not full, return */
      return;
    }

    /* digest buffer */
    ctx->buf_ptr = 0;
    printf("error\n");
    grsnTransform(ctx, ctx->buffer, ctx->statesize);
  }

  /* digest bulk of message */
  grsnTransform(ctx, input+index, msglen-index);
  index += ((msglen-index)/ctx->statesize)*ctx->statesize;

  /* store remaining data in buffer */
  while (index < msglen) {
    ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
  }
  return;
}
#define BILB ctx->bits_in_last_byte
/* Finish the hash: append the padding and the block count, compress the
 * last block(s), run the output transformation, copy the last
 * grsnLENGTH/8 bytes of the chaining value to 'output', then clear the
 * chaining words and the data buffer. */
void grsnFinal(grsnState* ctx,
	       BitSequence* output) {
  const int block_bytes = ctx->statesize;
  const int pad_limit = block_bytes - grsnLENGTHFIELDLEN;
  const int digest_bytes = grsnLENGTH/8;
  const u8 *digest = (BitSequence*)ctx->chaining;
  int k;

  /* pad with '1'-bit and first few '0'-bits */
  if (ctx->bits_in_last_byte) {
    const int used = ctx->bits_in_last_byte;
    BitSequence *last = &ctx->buffer[(int)ctx->buf_ptr-1];

    *last &= ((1<<used)-1)<<(8-used);
    *last ^= 0x1<<(7-used);
    ctx->bits_in_last_byte = 0;
  } else {
    ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
  }

  /* no room left for the length field: zero-fill and compress this block */
  if (ctx->buf_ptr > pad_limit) {
    while (ctx->buf_ptr < block_bytes)
      ctx->buffer[(int)ctx->buf_ptr++] = 0;
    grsnTransform(ctx, ctx->buffer, block_bytes);
    ctx->buf_ptr = 0;
  }

  /* zero-fill up to the start of the length field */
  while (ctx->buf_ptr < pad_limit)
    ctx->buffer[(int)ctx->buf_ptr++] = 0;

  /* length field: block count including this final block, written from
     the last byte of the block backwards */
  ctx->block_counter++;
  for (ctx->buf_ptr = block_bytes;
       ctx->buf_ptr > pad_limit;
       ctx->block_counter >>= 8)
    ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;

  /* digest final padding block */
  grsnTransform(ctx, ctx->buffer, block_bytes);
  /* perform output transformation */
  grsnOutputTransformation(ctx);

  /* the digest is the tail of the chaining value */
  for (k = 0; k < digest_bytes; k++)
    output[k] = digest[block_bytes - digest_bytes + k];

  /* zeroise relevant variables */
  for (k = 0; k < ctx->columns; k++)
    ctx->chaining[k] = 0;
  for (k = 0; k < block_bytes; k++)
    ctx->buffer[k] = 0;
  return;
}

80
algo/groestl/sse2/grsn.h Normal file
View File

@@ -0,0 +1,80 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#ifndef __grsn_h
#define __grsn_h
#include <stdio.h>
#include <stdlib.h>
#include "brg_endian.h"
#define NEED_UINT_64T
#include "brg_types.h"
#ifndef grsnLENGTH
#define grsnLENGTH 512
#endif
/* some sizes (number of bytes) */
#define grsnROWS 8
#define grsnLENGTHFIELDLEN grsnROWS
#define grsnCOLS512 8
#define grsnCOLS1024 16
#define grsnSIZE512 (grsnROWS*grsnCOLS512)
#define grsnSIZE1024 (grsnROWS*grsnCOLS1024)
#define grsnROUNDS512 10
#define grsnROUNDS1024 14
#if grsnLENGTH<=256
#define grsnCOLS grsnCOLS512
#define grsnSIZE grsnSIZE512
#define grsnROUNDS grsnROUNDS512
#else
#define grsnCOLS grsnCOLS1024
#define grsnSIZE grsnSIZE1024
#define grsnROUNDS grsnROUNDS1024
#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
/* byte n of var, counted from the least significant byte; n is
   parenthesised so that an expression argument such as i+1 is scaled as a
   whole instead of expanding to 8*i+1 */
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(n))))
/* reverses the byte order of a 64-bit value using four rotate-and-mask
   terms (li_64 is provided by brg_types.h) */
#define U64BIG(a) \
  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
   (ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
typedef enum { LONG, SHORT } Var;
/* NIST API begin */
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
/* Hash context: chaining value, buffer for not yet processed input and
   the bookkeeping fields used by grsnInit/grsnUpdate/grsnFinal. */
typedef struct {
__attribute__ ((aligned (32))) u64 chaining[grsnSIZE/8]; /* actual state */
__attribute__ ((aligned (32))) BitSequence buffer[grsnSIZE]; /* data buffer */
u64 block_counter; /* message block counter */
int buf_ptr; /* data buffer pointer (index of the next free byte in buffer) */
int bits_in_last_byte; /* no. of message bits in last byte of
data buffer */
int columns; /* no. of columns in state */
int statesize; /* total no. of bytes in state */
Var v; /* LONG or SHORT */
} grsnState;
void grsnInit(grsnState*);
void grsnUpdate(grsnState*, const BitSequence*, DataLength);
void grsnFinal(grsnState*, BitSequence*);
#endif /* __grsn_h */

1063
algo/groestl/sse2/grso-asm.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,10 @@
/* Prototypes of the two permutation routines called by grsoTransform and
   grsoOutputTransformation. */
#ifndef GRSOASM_H
#define GRSOASM_H
#include "grso.h"
/* both routines take a pointer to the u64 words they operate on; the
   callers pass aligned local arrays and read the result back from them */
void grsoP1024ASM (u64 *x) ;
void grsoQ1024ASM (u64 *x) ;
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
/* Prototypes of the two permutation routines called by grsoTransform and
   grsoOutputTransformation. */
#ifndef GRSOASM_H
#define GRSOASM_H
/* really same as the mmx asm.h */
/* made just in case something must be changed */
#include "grso.h"
/* both routines take a pointer to the u64 words they operate on; the
   callers pass aligned local arrays and read the result back from them */
void grsoP1024ASM (u64 *x) ;
void grsoQ1024ASM (u64 *x) ;
#endif

View File

@@ -0,0 +1,110 @@
/* hash.c January 2011
*
* Groestl-512 implementation with inline assembly containing mmx and
* sse instructions. Optimized for Opteron.
* Authors: Krystian Matusiewicz and Soeren S. Thomsen
*
* This code is placed in the public domain
*/
//#include "grso.h"
//#include "grso-asm.h"
// #include "grsotab.h"
#define DECL_GRS
/* load initial constants */
/* Reset the context `sts_grs` (must be in scope where the macro is
 * expanded): clears the state words, stores the digest size in bits
 * (8*grsoDIGESTSIZE, converted with grsoU64BIG) in the last word, and
 * resets buffer pointer and block counter.
 * The expansion already ends in ';', so it cannot be used as the body of
 * an if that is followed by else.
 * The line continuation after `while (0);` makes the comment line that
 * follows part of the macro text. */
#define GRS_I \
do { \
int i; \
/* set initial value */ \
for (i = 0; i < grsoCOLS-1; i++) sts_grs.grsstate[i] = 0; \
sts_grs.grsstate[grsoCOLS-1] = grsoU64BIG((u64)(8*grsoDIGESTSIZE)); \
\
/* set other variables */ \
sts_grs.grsbuf_ptr = 0; \
sts_grs.grsblock_counter = 0; \
} while (0); \
/* load hash */
#define GRS_U \
do { \
unsigned char* in = hash; \
unsigned long long index = 0; \
\
/* if the buffer contains data that has not yet been digested, first \
add data to buffer until full */ \
if (sts_grs.grsbuf_ptr) { \
while (sts_grs.grsbuf_ptr < grsoSIZE && index < 64) { \
hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \
} \
if (sts_grs.grsbuf_ptr < grsoSIZE) continue; \
\
/* digest buffer */ \
sts_grs.grsbuf_ptr = 0; \
grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
} \
\
/* digest bulk of message */ \
grsoTransform(&sts_grs, in+index, 64-index); \
index += ((64-index)/grsoSIZE)*grsoSIZE; \
\
/* store remaining data in buffer */ \
while (index < 64) { \
hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \
} \
\
} while (0);
/* groestl512 hash loaded */
/* hash = groestl512(loaded) */
#define GRS_C \
do { \
char *out = hash; \
int i, j = 0; \
unsigned char *s = (unsigned char*)sts_grs.grsstate; \
\
hashbuf[sts_grs.grsbuf_ptr++] = 0x80; \
\
/* pad with '0'-bits */ \
if (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \
/* padding requires two blocks */ \
while (sts_grs.grsbuf_ptr < grsoSIZE) { \
hashbuf[sts_grs.grsbuf_ptr++] = 0; \
} \
/* digest first padding block */ \
grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
sts_grs.grsbuf_ptr = 0; \
} \
while (sts_grs.grsbuf_ptr < grsoSIZE-grsoLENGTHFIELDLEN) { \
hashbuf[sts_grs.grsbuf_ptr++] = 0; \
} \
\
/* length padding */ \
sts_grs.grsblock_counter++; \
sts_grs.grsbuf_ptr = grsoSIZE; \
while (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \
hashbuf[--sts_grs.grsbuf_ptr] = (unsigned char)sts_grs.grsblock_counter; \
sts_grs.grsblock_counter >>= 8; \
} \
\
/* digest final padding block */ \
grsoTransform(&sts_grs, hashbuf, grsoSIZE); \
/* perform output transformation */ \
grsoOutputTransformation(&sts_grs); \
\
/* store hash result in output */ \
for (i = grsoSIZE-grsoDIGESTSIZE; i < grsoSIZE; i++,j++) { \
out[j] = s[i]; \
} \
\
/* zeroise relevant variables and deallocate memory */ \
for (i = 0; i < grsoCOLS; i++) { \
sts_grs.grsstate[i] = 0; \
} \
for (i = 0; i < grsoSIZE; i++) { \
hashbuf[i] = 0; \
} \
} while (0);

57
algo/groestl/sse2/grso.c Normal file
View File

@@ -0,0 +1,57 @@
/* hash.c January 2011
*
* Groestl-512 implementation with inline assembly containing mmx and
* sse instructions. Optimized for Opteron.
* Authors: Krystian Matusiewicz and Soeren S. Thomsen
*
* This code is placed in the public domain
*/
#include "algo/groestl/sse2/grso-asm.h"
#include "algo/groestl/sse2/grso.h"
#include "algo/groestl/sse2/grsotab.h"
/*
 * Absorb every complete grsoSIZE-byte block contained in the first len
 * bytes of in into the chaining value stored in ctx. A trailing partial
 * block is not consumed. The block counter advances by len/grsoSIZE.
 */
void grsoTransform(grsoState *ctx,
                   const unsigned char *in,
                   unsigned long long len) {
  u64 y[grsoCOLS+2] __attribute__ ((aligned (16)));
  u64 z[grsoCOLS+2] __attribute__ ((aligned (16)));
  u64 *h = (u64*)ctx->grsstate;
  unsigned long long remaining = len;
  int col;

  /* account for all whole blocks up front */
  ctx->grsblock_counter += len/grsoSIZE;

  while (remaining >= grsoSIZE) {
    u64 *m = (u64*)in;

    /* y <- m, z <- m xor h */
    for (col = 0; col < grsoCOLS; col++) {
      const u64 word = m[col];
      y[col] = word;
      z[col] = word ^ h[col];
    }

    grsoQ1024ASM(y);
    grsoP1024ASM(z);

    /* h' == h + Q(m) + P(h+m) */
    for (col = 0; col < grsoCOLS; col++) {
      h[col] ^= z[col] ^ y[col];
    }

    in += grsoSIZE;
    remaining -= grsoSIZE;
  }
}
/*
 * Output transformation: replace the state h held in ctx by P(h) xor h.
 */
void grsoOutputTransformation(grsoState *ctx) {
  u64 z[grsoCOLS] __attribute__ ((aligned (16)));
  u64 *h = ctx->grsstate;
  int col = 0;

  /* work on an aligned copy so the state itself is only touched once */
  while (col < grsoCOLS) {
    z[col] = h[col];
    col++;
  }

  grsoP1024ASM(z);

  for (col = grsoCOLS; col-- > 0; ) {
    h[col] ^= z[col];
  }
}

62
algo/groestl/sse2/grso.h Normal file
View File

@@ -0,0 +1,62 @@
#ifndef __hash_h
#define __hash_h
#include <stdio.h>
#include <stdlib.h>
#include "brg_endian.h"
#include "brg_types.h"
/* some sizes (number of bytes) */
/* bytes per state column */
#define grsoROWS 8
/* bytes reserved at the end of the final block for the block counter */
#define grsoLENGTHFIELDLEN grsoROWS
/* number of 64-bit columns in the state */
#define grsoCOLS 16
/* state / message block size in bytes (8*16 = 128) */
#define grsoSIZE (grsoROWS*grsoCOLS)
/* digest length in bytes */
#define grsoDIGESTSIZE 64
/* presumably the number of permutation rounds; not referenced in this header */
#define grsoROUNDS 14
/* rotate a left by n bits within 64 bits; a should be an unsigned 64-bit
   value and n must stay in 1..63 (a shift by 64 is undefined) */
#define grsoROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&((u64)0xffffffffffffffffULL))
/* Big-endian targets are rejected at compile time: this header only
   provides the little-endian helper macros below. */
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#error
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
/* Extract byte n of var as a u8, where byte 0 is the least significant.
   Both arguments are fully parenthesized so expressions such as i+1 are
   safe to pass for n. */
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(n))))
/* Reverse the byte order of a 64-bit value (host little-endian to
   big-endian) using four masked rotations. */
#define grsoU64BIG(a) \
((grsoROTL64(a, 8) & ((u64)0x000000ff000000ffULL)) | \
(grsoROTL64(a,24) & ((u64)0x0000ff000000ff00ULL)) | \
(grsoROTL64(a,40) & ((u64)0x00ff000000ff0000ULL)) | \
(grsoROTL64(a,56) & ((u64)0xff000000ff000000ULL)))
#endif /* IS_LITTLE_ENDIAN */
/* Hashing state shared by grsoTransform() and grsoOutputTransformation().
   The message block buffer is not part of this structure. */
typedef struct {
u64 grsstate[grsoCOLS]; /* chaining value, grsoCOLS 64-bit words */
u64 grsblock_counter; /* number of message blocks digested so far */
int grsbuf_ptr; /* number of bytes currently held in the caller's block buffer */
} grsoState;
//extern int grsoInit(grsoState* ctx);
//extern int grsoUpdate(grsoState* ctx, const unsigned char* in,
// unsigned long long len);
//extern int grsoUpdateq(grsoState* ctx, const unsigned char* in);
//extern int grsoFinal(grsoState* ctx,
// unsigned char* out);
//
//extern int grsohash(unsigned char *out,
// const unsigned char *in,
// unsigned long long len);
/* digest up to len bytes of input (full blocks only); a trailing partial
   block is ignored and the block counter grows by len/grsoSIZE */
void grsoTransform( grsoState *ctx, const unsigned char *in,
unsigned long long len );
/* given state h, do h <- P(h)+h */
void grsoOutputTransformation( grsoState *ctx );
/* NOTE(review): the three functions below are only declared here; their
   definitions are not in grso.c, so the meaning of hashbuf/hash and of the
   int return value should be confirmed against the implementing file. */
int grso_init ( grsoState* sts_grs );
int grso_update ( grsoState* sts_grs, char* hashbuf, char* hash );
int grso_close ( grsoState *sts_grs, char* hashbuf, char* hash );
#endif /* __hash_h */

File diff suppressed because one or more lines are too long

1263
algo/groestl/sse2/grss.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,45 @@
/*
* file : hash_api.h
* version : 1.0.208
* date : 14.12.2010
*
* Grostl multi-stream bitsliced implementation Hash API
*
* Cagdas Calik
* ccalik@metu.edu.tr
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
#ifndef GRSS_API_H
#define GRSS_API_H
#include "sha3_common.h"
#include <tmmintrin.h>
/* Context for the multi-stream bitsliced Groestl implementation.
   NOTE(review): the member roles below are inferred from their names; the
   implementation file is not shown, so confirm before relying on them. */
typedef struct
{
/* four groups of eight 128-bit words of hash state */
__m128i state1[8];
__m128i state2[8];
__m128i state3[8];
__m128i state4[8];
/* presumably per-round constants for the P and Q permutations (14 rows).
   NOTE(review): identifiers starting with '_' followed by an uppercase
   letter are reserved for the implementation in C. */
__m128i _Pconst[14][8];
__m128i _Qconst[14][8];
/* presumably constants used by the byte-shift step */
__m128i _shiftconst[8];
unsigned int uHashLength; /* presumably the digest length - TODO confirm units */
unsigned int uBlockLength; /* presumably the block length - TODO confirm units */
BitSequence buffer[128]; /* 128-element buffer for pending input */
} grssState;
/* Prepare state for hashing; grssbitlen is presumably the digest length in
   bits - TODO confirm against the implementation. */
void grssInit(grssState *state, int grssbitlen);
/* Feed message data into state; by its name databitlen is a length in
   bits - TODO confirm. */
void grssUpdate(grssState *state, const BitSequence *data, DataLength databitlen);
/* Finish hashing and write the digest to grssval; the caller must supply a
   buffer large enough for the configured digest. */
void grssFinal(grssState *state, BitSequence *grssval);
#endif // GRSS_API_H

File diff suppressed because one or more lines are too long

1381
algo/groestl/sse2/grsv-asm.h Normal file

File diff suppressed because it is too large Load Diff

202
algo/groestl/sse2/grsv.c Normal file
View File

@@ -0,0 +1,202 @@
/* hash.c Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#include "grsv.h"
#include "grsv-asm.h"
/*
 * Absorb every complete grsvSIZE-byte block contained in the first len
 * bytes of in. A trailing partial block is not consumed. The block counter
 * advances by the number of whole blocks.
 */
void grsvTransform(grsvState *ctx,
                   const u8 *in,
                   unsigned long long len) {
  unsigned long long blocks = len/grsvSIZE;

  /* increment block counter */
  ctx->grsvblock_counter += blocks;

  /* digest message, one block at a time */
  while (blocks-- > 0) {
#if grsvLENGTH<=256
    grsvTF512((u64*)ctx->grsvchaining, (u64*)in);
#else
    grsvTF1024((u64*)ctx->grsvchaining, (u64*)in);
#endif
    in += grsvSIZE;
  }

  asm volatile ("emms");
}
/*
 * Output transformation: given state h, do h <- P(h)+h. The routine that
 * runs is selected at compile time from grsvLENGTH.
 */
void grsvOutputTransformation(grsvState *ctx) {
#if (grsvLENGTH > 256)
  grsvOF1024((u64*)ctx->grsvchaining);
#else
  grsvOF512((u64*)ctx->grsvchaining);
#endif

  asm volatile ("emms");
}
/*
 * Initialise a hashing context.
 *
 * Sets the variant parameters from the compile-time grsvLENGTH, clears the
 * chaining value and the data buffer, installs the initial value and
 * resets all counters. ctx must point to a valid grsvState.
 *
 * If grsvLENGTH is not a positive multiple of 8 no larger than 512 the
 * function returns without touching ctx.
 */
void grsvInit(grsvState* ctx) {
  /* NOTE(review): SET_CONSTANTS() comes from grsv-asm.h (not shown here)
     and may refer to this local by name; keep both its name and type. */
  u8 i = 0;

  /* output size (in bits) must be a positive integer less than or
     equal to 512, and divisible by 8 */
  if (grsvLENGTH <= 0 || (grsvLENGTH%8) || grsvLENGTH > 512)
    return;

  /* set number of state columns and state size depending on
     variant */
  ctx->grsvcolumns = grsvCOLS;
  ctx->grsvstatesize = grsvSIZE;
#if (grsvLENGTH <= 256)
  ctx->grsvv = SHORT;
#else
  ctx->grsvv = LONG;
#endif

  SET_CONSTANTS();

  /* grsvchaining and grsvbuffer are arrays embedded in the context, so
     they always exist and need no NULL test; just clear them */
  for (i=0; i<grsvSIZE/8; i++) {
    ctx->grsvchaining[i] = 0;
  }
  for (i=0; i<grsvSIZE; i++) {
    ctx->grsvbuffer[i] = 0;
  }

  /* set initial value: digest length in bits, big-endian, in the last
     state column */
  ctx->grsvchaining[ctx->grsvcolumns-1] = U64BIG((u64)grsvLENGTH);
  grsvINIT(ctx->grsvchaining);

  /* set other variables */
  ctx->grsvbuf_ptr = 0;
  ctx->grsvblock_counter = 0;
  ctx->grsvbits_in_last_byte = 0;
}
/*
 * Update the hashing state with databitlen bits of input.
 *
 * Whole blocks are digested immediately; any remaining bytes are kept in
 * the context buffer for the next call or for grsvFinal(). A bit length
 * that is not a multiple of 8 is only allowed in the last call: once a
 * partial byte has been stored, further calls return without doing
 * anything.
 *
 * ctx        - context previously set up by grsvInit()
 * input      - message data; (databitlen+7)/8 bytes are read
 * databitlen - number of message bits in input
 */
void grsvUpdate(grsvState* ctx,
                const grsvBitSequence* input,
                grsvDataLength databitlen) {
  /* byte positions use the full width of grsvDataLength so that inputs
     larger than INT_MAX bytes are not truncated */
  grsvDataLength index = 0;
  const grsvDataLength msglen = databitlen/8;
  const int rem = (int)(databitlen%8);

  /* non-integral number of message bytes can only be supplied in the
     last call to this function */
  if (ctx->grsvbits_in_last_byte) return;

  /* if the buffer contains data that has not yet been digested, first
     add data to buffer until full */
  if (ctx->grsvbuf_ptr) {
    while (ctx->grsvbuf_ptr < ctx->grsvstatesize && index < msglen) {
      ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index++];
    }
    if (ctx->grsvbuf_ptr < ctx->grsvstatesize) {
      /* buffer still not full, return */
      if (rem) {
        ctx->grsvbits_in_last_byte = rem;
        ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index];
      }
      return;
    }

    /* buffer is full: digest it and start over with an empty buffer */
    ctx->grsvbuf_ptr = 0;
    grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize);
  }

  /* digest bulk of message; grsvTransform only consumes whole blocks */
  grsvTransform(ctx, input+index, msglen-index);
  index += ((msglen-index)/ctx->grsvstatesize)*ctx->grsvstatesize;

  /* store remaining data in buffer */
  while (index < msglen) {
    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index++];
  }

  /* if non-integral number of bytes have been supplied, store
     remaining bits in last byte, together with information about
     number of bits */
  if (rem) {
    ctx->grsvbits_in_last_byte = rem;
    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index];
  }
}
/* shorthand for the number of valid bits in the last buffered byte */
#define BILB ctx->grsvbits_in_last_byte
/*
 * Finish the hash: pad the buffered data, digest it, apply the output
 * transformation and copy the last grsvLENGTH/8 bytes of the state to
 * output. The chaining value and the data buffer are cleared before
 * returning.
 */
void grsvFinal(grsvState* ctx,
               grsvBitSequence* output) {
  const int digest_len = grsvLENGTH/8;
  u8 *state_bytes = (grsvBitSequence*)ctx->grsvchaining;
  int k;

  /* pad with '1'-bit and first few '0'-bits */
  if (!BILB) {
    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = 0x80;
  } else {
    /* last byte is partial: mask it, then flip the bit after the data */
    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr-1] ^= 0x1<<(7-BILB);
    BILB = 0;
  }

  /* pad with '0'-bits */
  if (ctx->grsvbuf_ptr > ctx->grsvstatesize-grsvLENGTHFIELDLEN) {
    /* no room left for the length field: padding requires two blocks */
    for (; ctx->grsvbuf_ptr < ctx->grsvstatesize; ctx->grsvbuf_ptr++) {
      ctx->grsvbuffer[(int)ctx->grsvbuf_ptr] = 0;
    }
    /* digest first padding block */
    grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize);
    ctx->grsvbuf_ptr = 0;
  }
  for (; ctx->grsvbuf_ptr < ctx->grsvstatesize-grsvLENGTHFIELDLEN;
       ctx->grsvbuf_ptr++) {
    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr] = 0;
  }

  /* length padding: count the final block and write the counter into
     the tail of the buffer, filling from the end backwards */
  ctx->grsvblock_counter++;
  for (ctx->grsvbuf_ptr = ctx->grsvstatesize;
       ctx->grsvbuf_ptr > ctx->grsvstatesize-grsvLENGTHFIELDLEN;
       ctx->grsvblock_counter >>= 8) {
    ctx->grsvbuf_ptr--;
    ctx->grsvbuffer[(int)ctx->grsvbuf_ptr] = (u8)ctx->grsvblock_counter;
  }

  /* digest final padding block */
  grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize);
  /* perform output transformation */
  grsvOutputTransformation(ctx);

  /* store hash result in output */
  for (k = 0; k < digest_len; k++) {
    output[k] = state_bytes[ctx->grsvstatesize-digest_len+k];
  }

  /* zeroise relevant variables */
  for (k = 0; k < ctx->grsvcolumns; k++) {
    ctx->grsvchaining[k] = 0;
  }
  for (k = 0; k < ctx->grsvstatesize; k++) {
    ctx->grsvbuffer[k] = 0;
  }
}

77
algo/groestl/sse2/grsv.h Normal file
View File

@@ -0,0 +1,77 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#ifndef __grsv_h
#define __grsv_h
#include <stdio.h>
#include <stdlib.h>
#include "brg_endian.h"
#define NEED_UINT_64T
#include "brg_types.h"
/* digest length in bits; selects the variant at compile time */
#define grsvLENGTH 512
/* some sizes (number of bytes) */
/* bytes per state column */
#define grsvROWS 8
/* bytes reserved at the end of the final block for the block counter */
#define grsvLENGTHFIELDLEN grsvROWS
/* columns / state bytes / rounds for the short (<=256 bit) variant ... */
#define grsvCOLS512 8
/* ... and for the long variant */
#define grsvCOLS1024 16
#define grsvSIZE512 (grsvROWS*grsvCOLS512)
#define grsvSIZE1024 (grsvROWS*grsvCOLS1024)
#define grsvROUNDS512 10
#define grsvROUNDS1024 14
/* pick the active parameters; with grsvLENGTH 512 the #else branch is
   used: 16 columns, 128-byte state, 14 rounds */
#if grsvLENGTH<=256
#define grsvCOLS grsvCOLS512
#define grsvSIZE grsvSIZE512
#define grsvROUNDS grsvROUNDS512
#else
#define grsvCOLS grsvCOLS1024
#define grsvSIZE grsvSIZE1024
#define grsvROUNDS grsvROUNDS1024
#endif
/* rotate a left by n bits within 64 bits; n must stay in 1..63 (a shift
   by 64 is undefined). li_64() is not defined in this header; presumably
   it comes from brg_types.h */
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
/* extract byte n of var, where byte 0 is the most significant */
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
/* value is already big-endian: no conversion needed */
#define U64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
/* Extract byte n of var as a u8, where byte 0 is the least significant.
   Both arguments are fully parenthesized so expressions such as i+1 are
   safe to pass for n. */
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(n))))
/* Reverse the byte order of a 64-bit value (host little-endian to
   big-endian) using four masked rotations. */
#define U64BIG(a) \
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
(ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
/* variant tag stored in the context: SHORT when grsvLENGTH <= 256,
   LONG otherwise.
   NOTE(review): the enumerators are unprefixed and may clash with other
   headers that define LONG or SHORT. */
typedef enum { LONG, SHORT } grsvVar;
/* raw message / digest byte */
typedef unsigned char grsvBitSequence;
/* message length, expressed in bits by grsvUpdate() */
typedef unsigned long long grsvDataLength;
/* Hashing context used by grsvInit(), grsvUpdate() and grsvFinal().
   The chaining value and the data buffer are embedded arrays, each
   aligned to 32 bytes. */
typedef struct {
__attribute__ ((aligned (32))) u64 grsvchaining[grsvSIZE/8]; /* actual state */
__attribute__ ((aligned (32))) grsvBitSequence grsvbuffer[grsvSIZE]; /* data buffer */
u64 grsvblock_counter; /* message block counter */
int grsvbuf_ptr; /* number of bytes currently stored in grsvbuffer */
int grsvbits_in_last_byte; /* no. of message bits in last byte of
data buffer; 0 when the message ends on a byte boundary */
int grsvcolumns; /* no. of columns in state */
int grsvstatesize; /* total no. of bytes in state */
grsvVar grsvv; /* LONG or SHORT */
} grsvState;
/* initialise a context for a grsvLENGTH-bit digest */
void grsvInit(grsvState*);
/* absorb the given number of message BITS; only the last call may pass a
   length that is not a multiple of 8 */
void grsvUpdate(grsvState*, const grsvBitSequence*, grsvDataLength);
/* pad, finish and write grsvLENGTH/8 digest bytes to the output buffer;
   the chaining value and data buffer are cleared afterwards */
void grsvFinal(grsvState*, grsvBitSequence*);
#endif /* __grsv_h */