Initial upload v3.4.7

This commit is contained in:
Jay D Dee
2016-09-22 13:16:18 -04:00
parent a3c8079774
commit a35039bc05
480 changed files with 211015 additions and 3 deletions

View File

View File

@@ -0,0 +1,14 @@
This package contains an implementation of the Groestl-512 hash
function optimized for the Intel AES instructions.
Authors are Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
There are no known present or future claims by a copyright holder that
the distribution of this software infringes the copyright. In
particular, the author of the software is not making such claims and
does not intend to make such claims.
Moreover, there are no known present or future claims by a patent
holder that the use of this software infringes the patent. In
particular, the author of the software is not making such claims and
does not intend to make such claims.

View File

@@ -0,0 +1,2 @@
#define CRYPTO_BYTES 64
#define CRYPTO_VERSION "2.2"

View File

@@ -0,0 +1 @@
amd64

View File

@@ -0,0 +1,133 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
*/
#ifndef _BRG_ENDIAN_H
#define _BRG_ENDIAN_H
#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
/* Include files where endian defines and byteswap functions may reside */
#if defined( __sun )
# include <sys/isa_defs.h>
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
# include <sys/endian.h>
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
# include <machine/endian.h>
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
# if !defined( __MINGW32__ ) && !defined( _AIX )
# include <endian.h>
# if !defined( __BEOS__ )
# include <byteswap.h>
# endif
# endif
#endif
/* Now attempt to set the define for platform byte order using any */
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */
/* seem to encompass most endian symbol definitions */
#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( _BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( _LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
# endif
#elif defined( __BIG_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif defined( __LITTLE_ENDIAN__ )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#endif
/* if the platform byte order could not be determined, then try to */
/* set this define using common machine defines */
#if !defined(PLATFORM_BYTE_ORDER)
#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
defined( vax ) || defined( vms ) || defined( VMS ) || \
defined( __VMS ) || defined( _M_X64 )
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
#else
# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
#endif
#endif
#endif

View File

@@ -0,0 +1,234 @@
/*
---------------------------------------------------------------------------
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
(a few lines added by Soeren S. Thomsen, October 2008)
LICENSE TERMS
The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:
1. source code distributions include the above copyright notice, this
list of conditions and the following disclaimer;
2. binary distributions include the above copyright notice, this list
of conditions and the following disclaimer in their documentation;
3. the name of the copyright holder is not used to endorse products
built using this software without specific written permission.
DISCLAIMER
This software is provided 'as is' with no explicit or implied warranties
in respect of its properties, including, but not limited to, correctness
and/or fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 20/12/2007
The unsigned integer types defined here are of the form uint_<nn>t where
<nn> is the length of the type; for example, the unsigned 32-bit type is
'uint_32t'. These are NOT the same as the 'C99 integer types' that are
defined in the inttypes.h and stdint.h headers since attempts to use these
types have shown that support for them is still highly variable. However,
since the latter are of the form uint<nn>_t, a regular expression search
and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
can be used to convert the types used here to the C99 standard types.
*/
#ifndef _BRG_TYPES_H
#define _BRG_TYPES_H
#if defined(__cplusplus)
extern "C" {
#endif
#include <limits.h>
#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
# include <stddef.h>
# define ptrint_t intptr_t
#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
# include <stdint.h>
# define ptrint_t intptr_t
#else
# define ptrint_t int
#endif
#ifndef BRG_UI8
# define BRG_UI8
# if UCHAR_MAX == 255u
typedef unsigned char uint_8t;
# else
# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
# endif
#endif
#ifndef BRG_UI16
# define BRG_UI16
# if USHRT_MAX == 65535u
typedef unsigned short uint_16t;
# else
# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
# endif
#endif
#ifndef BRG_UI32
# define BRG_UI32
# if UINT_MAX == 4294967295u
# define li_32(h) 0x##h##u
typedef unsigned int uint_32t;
# elif ULONG_MAX == 4294967295u
# define li_32(h) 0x##h##ul
typedef unsigned long uint_32t;
# elif defined( _CRAY )
# error This code needs 32-bit data types, which Cray machines do not provide
# else
# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
# endif
#endif
#ifndef BRG_UI64
# if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
# define BRG_UI64
# define li_64(h) 0x##h##ui64
typedef unsigned __int64 uint_64t;
# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */
# define BRG_UI64
# define li_64(h) 0x##h##ui64
typedef unsigned __int64 uint_64t;
# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# elif defined( __MVS__ )
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned int long long uint_64t;
# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
# if UINT_MAX == 18446744073709551615u
# define BRG_UI64
# define li_64(h) 0x##h##u
typedef unsigned int uint_64t;
# endif
# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
# if ULONG_MAX == 18446744073709551615ul
# define BRG_UI64
# define li_64(h) 0x##h##ul
typedef unsigned long uint_64t;
# endif
# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
# if ULLONG_MAX == 18446744073709551615ull
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# endif
# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
# if ULONG_LONG_MAX == 18446744073709551615ull
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
# endif
# endif
#endif
#if !defined( BRG_UI64 )
# if defined( NEED_UINT_64T )
# define BRG_UI64
# define li_64(h) 0x##h##ull
typedef unsigned long long uint_64t;
/*# error Please define uint_64t as an unsigned 64 bit type in brg_types.h*/
# endif
#endif
#ifndef RETURN_VALUES
# define RETURN_VALUES
# if defined( DLL_EXPORT )
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
# define VOID_RETURN __declspec( dllexport ) void __stdcall
# define INT_RETURN __declspec( dllexport ) int __stdcall
# elif defined( __GNUC__ )
# define VOID_RETURN __declspec( __dllexport__ ) void
# define INT_RETURN __declspec( __dllexport__ ) int
# else
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
# endif
# elif defined( DLL_IMPORT )
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
# define VOID_RETURN __declspec( dllimport ) void __stdcall
# define INT_RETURN __declspec( dllimport ) int __stdcall
# elif defined( __GNUC__ )
# define VOID_RETURN __declspec( __dllimport__ ) void
# define INT_RETURN __declspec( __dllimport__ ) int
# else
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
# endif
# elif defined( __WATCOMC__ )
# define VOID_RETURN void __cdecl
# define INT_RETURN int __cdecl
# else
# define VOID_RETURN void
# define INT_RETURN int
# endif
#endif
/* These defines are used to detect and set the memory alignment of pointers.
Note that offsets are in bytes.
ALIGN_OFFSET(x,n) return the positive or zero offset of
the memory addressed by the pointer 'x'
from an address that is aligned on an
'n' byte boundary ('n' is a power of 2)
ALIGN_FLOOR(x,n) return a pointer that points to memory
that is aligned on an 'n' byte boundary
and is not higher than the memory address
pointed to by 'x' ('n' is a power of 2)
ALIGN_CEIL(x,n) return a pointer that points to memory
that is aligned on an 'n' byte boundary
and is not lower than the memory address
pointed to by 'x' ('n' is a power of 2)
*/
#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1))
#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1)))
#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1)))
/* These defines are used to declare buffers in a way that allows
faster operations on longer variables to be used. In all these
defines 'size' must be a power of 2 and >= 8. NOTE that the
buffer size is in bytes but the type length is in bits
UNIT_TYPEDEF(x,size) declares a variable 'x' of length
'size' bits
BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize'
bytes defined as an array of variables
each of 'size' bits (bsize must be a
multiple of size / 8)
UNIT_CAST(x,size) casts a variable to a type of
length 'size' bits
UPTR_CAST(x,size) casts a pointer to a pointer to a
varaiable of length 'size' bits
*/
#define UI_TYPE(size) uint_##size##t
#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x
#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)]
#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x))
#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x))
/* Added by Soeren S. Thomsen (begin) */
#define u8 uint_8t
#define u32 uint_32t
#define u64 uint_64t
/* (end) */
#if defined(__cplusplus)
}
#endif
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,965 @@
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
* instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include <smmintrin.h>
#include <wmmintrin.h>
#include "hash-groestl.h"
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm_xor_si128(j, j);\
j = _mm_cmpgt_epi8(j, i);\
i = _mm_add_epi8(i, i);\
j = _mm_and_si128(j, k);\
i = _mm_xor_si128(i, j);\
}
/**/
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+3}
x_i = t_i + t_{i+3}
y_i = t_i + t+{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
a2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm_xor_si128(b0, a4);\
b6 = _mm_xor_si128(b6, a4);\
b1 = _mm_xor_si128(b1, a5);\
b7 = _mm_xor_si128(b7, a5);\
b2 = _mm_xor_si128(b2, a6);\
b0 = _mm_xor_si128(b0, a6);\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm_xor_si128(b3, a7);\
b1 = _mm_xor_si128(b1, a7);\
TEMP1 = b1;\
b4 = _mm_xor_si128(b4, a0);\
b2 = _mm_xor_si128(b2, a0);\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm_xor_si128(b5, a1);\
b3 = _mm_xor_si128(b3, a1);\
b1 = a1;\
b6 = _mm_xor_si128(b6, a2);\
b4 = _mm_xor_si128(b4, a2);\
TEMP2 = a2;\
b7 = _mm_xor_si128(b7, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(a2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = ALL_1B;\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#if (LENGTH <= 256)
#define SET_CONSTANTS(){\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0); \
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm_unpacklo_epi16(i0, i1);\
o1 = _mm_unpackhi_epi16(o1, i1);\
i2 = _mm_unpacklo_epi16(i2, i3);\
t0 = _mm_unpackhi_epi16(t0, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
o2 = _mm_unpackhi_epi32(o2, i2);\
o3 = _mm_unpackhi_epi32(o3, t0);\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm_unpacklo_epi64(i0, i4);\
o1 = _mm_unpackhi_epi64(o1, i4);\
o3 = i1;\
o4 = i2;\
o2 = _mm_unpacklo_epi64(o2, i5);\
o3 = _mm_unpackhi_epi64(o3, i5);\
o5 = i2;\
o6 = i3;\
o4 = _mm_unpacklo_epi64(o4, i6);\
o5 = _mm_unpackhi_epi64(o5, i6);\
o7 = i3;\
o6 = _mm_unpacklo_epi64(o6, i7);\
o7 = _mm_unpackhi_epi64(o7, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o0 = _mm_unpackhi_epi64(o0, i1);\
o1 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
o1 = _mm_unpackhi_epi64(o1, i3);\
o2 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
o2 = _mm_unpackhi_epi64(o2, i5);\
o3 = i6;\
i6 = _mm_unpacklo_epi64(i6, i7);\
o3 = _mm_unpackhi_epi64(o3, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm_unpacklo_epi64(i0, t0);\
i1 = _mm_unpackhi_epi64(i1, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i3 = _mm_unpackhi_epi64(i3, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i5 = _mm_unpackhi_epi64(i5, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
i7 = _mm_unpackhi_epi64(i7, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
endif\
}/**/
void INIT(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
xmm8 = chaining[0];
xmm0 = chaining[1];
xmm4 = chaining[2];
xmm5 = chaining[3];
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm8, xmm12);
xmm0 = _mm_xor_si128(xmm0, xmm2);
xmm4 = _mm_xor_si128(xmm4, xmm6);
xmm5 = _mm_xor_si128(xmm5, xmm7);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
}
#endif
#if (LENGTH > 256)
#define SET_CONSTANTS(){\
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\
SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\
SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\
SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\
SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\
SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\
SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\
for(i = 0; i < ROUNDS1024; i++)\
{\
ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
}\
}while(0);\
/* one round
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_aesenclast_si128(a7, b0);\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
#define ROUNDS_P(){\
u8 round_counter = 0;\
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant P1024 */\
xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\
/* SubBytes + MixBytes */\
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant P1024 */\
xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
/* SubBytes + MixBytes */\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
#define ROUNDS_Q(){\
u8 round_counter = 0;\
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant Q1024 */\
xmm1 = ALL_FF;\
xmm8 = _mm_xor_si128(xmm8, xmm1);\
xmm9 = _mm_xor_si128(xmm9, xmm1);\
xmm10 = _mm_xor_si128(xmm10, xmm1);\
xmm11 = _mm_xor_si128(xmm11, xmm1);\
xmm12 = _mm_xor_si128(xmm12, xmm1);\
xmm13 = _mm_xor_si128(xmm13, xmm1);\
xmm14 = _mm_xor_si128(xmm14, xmm1);\
xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\
/* SubBytes + MixBytes */\
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant Q1024 */\
xmm9 = ALL_FF;\
xmm0 = _mm_xor_si128(xmm0, xmm9);\
xmm1 = _mm_xor_si128(xmm1, xmm9);\
xmm2 = _mm_xor_si128(xmm2, xmm9);\
xmm3 = _mm_xor_si128(xmm3, xmm9);\
xmm4 = _mm_xor_si128(xmm4, xmm9);\
xmm5 = _mm_xor_si128(xmm5, xmm9);\
xmm6 = _mm_xor_si128(xmm6, xmm9);\
xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\
/* SubBytes + MixBytes */\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
/* Matrix Transpose
* input is a 1024-bit state with two columns in one xmm
* output is a 1024-bit state with two rows in one xmm
* inputs: i0-i7
* outputs: i0-i7
* clobbers: t0-t7
*/
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
t0 = TRANSP_MASK;\
\
i6 = _mm_shuffle_epi8(i6, t0);\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
t1 = i2;\
i4 = _mm_shuffle_epi8(i4, t0);\
i5 = _mm_shuffle_epi8(i5, t0);\
t2 = i4;\
t3 = i6;\
i7 = _mm_shuffle_epi8(i7, t0);\
\
/* continue with unpack using 4 temp registers */\
t0 = i0;\
t2 = _mm_unpackhi_epi16(t2, i5);\
i4 = _mm_unpacklo_epi16(i4, i5);\
t3 = _mm_unpackhi_epi16(t3, i7);\
i6 = _mm_unpacklo_epi16(i6, i7);\
t0 = _mm_unpackhi_epi16(t0, i1);\
t1 = _mm_unpackhi_epi16(t1, i3);\
i2 = _mm_unpacklo_epi16(i2, i3);\
i0 = _mm_unpacklo_epi16(i0, i1);\
\
/* shuffle with immediate */\
t0 = _mm_shuffle_epi32(t0, 216);\
t1 = _mm_shuffle_epi32(t1, 216);\
t2 = _mm_shuffle_epi32(t2, 216);\
t3 = _mm_shuffle_epi32(t3, 216);\
i0 = _mm_shuffle_epi32(i0, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
i4 = _mm_shuffle_epi32(i4, 216);\
i6 = _mm_shuffle_epi32(i6, 216);\
\
/* continue with unpack */\
t4 = i0;\
i0 = _mm_unpacklo_epi32(i0, i2);\
t4 = _mm_unpackhi_epi32(t4, i2);\
t5 = t0;\
t0 = _mm_unpacklo_epi32(t0, t1);\
t5 = _mm_unpackhi_epi32(t5, t1);\
t6 = i4;\
i4 = _mm_unpacklo_epi32(i4, i6);\
t7 = t2;\
t6 = _mm_unpackhi_epi32(t6, i6);\
i2 = t0;\
t2 = _mm_unpacklo_epi32(t2, t3);\
i3 = t0;\
t7 = _mm_unpackhi_epi32(t7, t3);\
\
/* there are now 2 rows in each xmm */\
/* unpack to get 1 row of CV in each xmm */\
i1 = i0;\
i1 = _mm_unpackhi_epi64(i1, i4);\
i0 = _mm_unpacklo_epi64(i0, i4);\
i4 = t4;\
i3 = _mm_unpackhi_epi64(i3, t2);\
i5 = t4;\
i2 = _mm_unpacklo_epi64(i2, t2);\
i6 = t5;\
i5 = _mm_unpackhi_epi64(i5, t6);\
i7 = t5;\
i4 = _mm_unpacklo_epi64(i4, t6);\
i7 = _mm_unpackhi_epi64(i7, t7);\
i6 = _mm_unpacklo_epi64(i6, t7);\
/* transpose done */\
}/**/
/* Matrix Transpose Inverse
* input is a 1024-bit state with two rows in one xmm
* output is a 1024-bit state with two columns in one xmm
* inputs: i0-i7
* outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
* clobbers: t0-t4
*/
#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
/* transpose matrix to get output format */\
o1 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o1 = _mm_unpackhi_epi64(o1, i1);\
t0 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
t0 = _mm_unpackhi_epi64(t0, i3);\
t1 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
t1 = _mm_unpackhi_epi64(t1, i5);\
t2 = i6;\
o0 = TRANSP_MASK;\
i6 = _mm_unpacklo_epi64(i6, i7);\
t2 = _mm_unpackhi_epi64(t2, i7);\
/* load transpose mask into a register, because it will be used 8 times */\
i0 = _mm_shuffle_epi8(i0, o0);\
i2 = _mm_shuffle_epi8(i2, o0);\
i4 = _mm_shuffle_epi8(i4, o0);\
i6 = _mm_shuffle_epi8(i6, o0);\
o1 = _mm_shuffle_epi8(o1, o0);\
t0 = _mm_shuffle_epi8(t0, o0);\
t1 = _mm_shuffle_epi8(t1, o0);\
t2 = _mm_shuffle_epi8(t2, o0);\
/* continue with unpack using 4 temp registers */\
t3 = i4;\
o2 = o1;\
o0 = i0;\
t4 = t1;\
\
t3 = _mm_unpackhi_epi16(t3, i6);\
i4 = _mm_unpacklo_epi16(i4, i6);\
o0 = _mm_unpackhi_epi16(o0, i2);\
i0 = _mm_unpacklo_epi16(i0, i2);\
o2 = _mm_unpackhi_epi16(o2, t0);\
o1 = _mm_unpacklo_epi16(o1, t0);\
t4 = _mm_unpackhi_epi16(t4, t2);\
t1 = _mm_unpacklo_epi16(t1, t2);\
/* shuffle with immediate */\
i4 = _mm_shuffle_epi32(i4, 216);\
t3 = _mm_shuffle_epi32(t3, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
o2 = _mm_shuffle_epi32(o2, 216);\
i0 = _mm_shuffle_epi32(i0, 216);\
o0 = _mm_shuffle_epi32(o0, 216);\
t1 = _mm_shuffle_epi32(t1, 216);\
t4 = _mm_shuffle_epi32(t4, 216);\
/* continue with unpack */\
i1 = i0;\
i3 = o0;\
i5 = o1;\
i7 = o2;\
i0 = _mm_unpacklo_epi32(i0, i4);\
i1 = _mm_unpackhi_epi32(i1, i4);\
o0 = _mm_unpacklo_epi32(o0, t3);\
i3 = _mm_unpackhi_epi32(i3, t3);\
o1 = _mm_unpacklo_epi32(o1, t1);\
i5 = _mm_unpackhi_epi32(i5, t1);\
o2 = _mm_unpacklo_epi32(o2, t4);\
i7 = _mm_unpackhi_epi32(i7, t4);\
/* transpose done */\
}/**/
void INIT(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm8 - xmm15 */
xmm8 = chaining[0];
xmm9 = chaining[1];
xmm10 = chaining[2];
xmm11 = chaining[3];
xmm12 = chaining[4];
xmm13 = chaining[5];
xmm14 = chaining[6];
xmm15 = chaining[7];
/* transform chaining value from column ordering into row ordering */
Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
/* store transposed IV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
}
void TF1024(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i QTEMP[8];
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm8 - xmm15 (Q = message) */
xmm8 = message[0];
xmm9 = message[1];
xmm10 = message[2];
xmm11 = message[3];
xmm12 = message[4];
xmm13 = message[5];
xmm14 = message[6];
xmm15 = message[7];
/* transform message M from column ordering into row ordering */
Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
/* store message M (Q input) for later */
QTEMP[0] = xmm8;
QTEMP[1] = xmm9;
QTEMP[2] = xmm10;
QTEMP[3] = xmm11;
QTEMP[4] = xmm12;
QTEMP[5] = xmm13;
QTEMP[6] = xmm14;
QTEMP[7] = xmm15;
/* xor CV to message to get P input */
/* result: CV+M in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* compute permutation P */
/* result: P(CV+M) in xmm8...xmm15 */
ROUNDS_P();
/* xor CV to P output (feed-forward) */
/* result: P(CV+M)+CV in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* store P(CV+M)+CV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
/* load message M (Q input) into xmm8-15 */
xmm8 = QTEMP[0];
xmm9 = QTEMP[1];
xmm10 = QTEMP[2];
xmm11 = QTEMP[3];
xmm12 = QTEMP[4];
xmm13 = QTEMP[5];
xmm14 = QTEMP[6];
xmm15 = QTEMP[7];
/* compute permutation Q */
/* result: Q(M) in xmm8...xmm15 */
ROUNDS_Q();
/* xor Q output */
/* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* store CV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF1024(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
/* load CV into registers xmm8 - xmm15 */
xmm8 = chaining[0];
xmm9 = chaining[1];
xmm10 = chaining[2];
xmm11 = chaining[3];
xmm12 = chaining[4];
xmm13 = chaining[5];
xmm14 = chaining[6];
xmm15 = chaining[7];
/* compute permutation P */
/* result: P(CV) in xmm8...xmm15 */
ROUNDS_P();
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8...xmm15 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
/* transpose CV back from row ordering to column ordering */
/* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
/* we only need to return the truncated half of the state */
chaining[4] = xmm0;
chaining[5] = xmm6;
chaining[6] = xmm13;
chaining[7] = xmm15;
return;
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,16 @@
// specify assembly or intrinsics implementation
//#define TASM
#define TINTR
//#define AES_NI
//#ifdef AES_NI
// specify AES-NI, AVX (with AES-NI) or vector-permute implementation
//#ifndef NO_AES_NI
#define VAES
// #define VAVX
// #define VVPERM
//#endif

View File

@@ -0,0 +1,529 @@
/* groestl-asm-aes.h Aug 2011
*
* Groestl implementation with inline assembly using ssse3, sse4.1, and aes
* instructions.
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include "hash-groestl256.h"
/* global constants */
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
/* temporary variables */
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP[3*16];
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\
asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\
asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\
asm("pand xmm"tostr(j)", xmm"tostr(k)"");\
asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\
}/**/
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+3}
x_i = t_i + t_{i+3}
y_i = t_i + t+{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\
asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\
asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\
asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\
asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\
asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\
asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\
asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\
/* spill values y_4, y_5 to memory */\
asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\
asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\
asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\
asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\
asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\
asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\
asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
\
/* compute x_i = t_i + t_{i+3} */\
asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\
asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\
asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
asm("movaps xmm"tostr(b1)", [ALL_1B]");\
MUL2(a0, b0, b1);\
asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\
MUL2(a1, b0, b1);\
asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\
MUL2(a2, b0, b1);\
asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\
MUL2(a3, b0, b1);\
asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\
MUL2(a4, b0, b1);\
asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\
MUL2(a5, b0, b1);\
asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\
MUL2(a6, b0, b1);\
asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\
MUL2(a7, b0, b1);\
asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
MUL2(a1, b0, b1);\
asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\
MUL2(a2, b0, b1);\
asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
MUL2(a5, b0, b1);\
asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\
MUL2(a6, b0, b1);\
asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
MUL2(a7, b0, b1);\
asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\
asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\
asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
}/*MixBytes*/
#define SET_CONSTANTS(){\
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
for(i = 0; i < ROUNDS512; i++)\
{\
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
}\
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
}while(0);
#define Push_All_Regs() do{\
/* not using any...
asm("push rax");\
asm("push rbx");\
asm("push rcx");*/\
}while(0);
#define Pop_All_Regs() do{\
/* not using any...
asm("pop rcx");\
asm("pop rbx");\
asm("pop rax");*/\
}while(0);
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
/* ShiftBytes + SubBytes (interleaved) */\
asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
\
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
\
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
\
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
\
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
void INIT256(u64* h)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
asm volatile ("emms");
/* load IV into registers xmm12 - xmm15 */
asm ("movaps xmm12, [rdi+0*16]");
asm ("movaps xmm13, [rdi+1*16]");
asm ("movaps xmm14, [rdi+2*16]");
asm ("movaps xmm15, [rdi+3*16]");
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* store transposed IV */
asm ("movaps [rdi+0*16], xmm12");
asm ("movaps [rdi+1*16], xmm2");
asm ("movaps [rdi+2*16], xmm6");
asm ("movaps [rdi+3*16], xmm7");
asm volatile ("emms");
asm (".att_syntax noprefix");
}
void TF512(u64* h, u64* m)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
/* message M in rsi */
#ifdef IACA_TRACE
IACA_START;
#endif
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load message into registers xmm12 - xmm15 (Q = message) */
asm ("movaps xmm12, [rsi+0*16]");
asm ("movaps xmm13, [rsi+1*16]");
asm ("movaps xmm14, [rsi+2*16]");
asm ("movaps xmm15, [rsi+3*16]");
/* transform message M from column ordering into row ordering */
/* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm0, [rdi+1*16]");
asm ("movaps xmm4, [rdi+2*16]");
asm ("movaps xmm5, [rdi+3*16]");
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
asm ("pxor xmm8, xmm12");
asm ("pxor xmm0, xmm2");
asm ("pxor xmm4, xmm6");
asm ("pxor xmm5, xmm7");
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
asm ("pxor xmm0, xmm8");
asm ("pxor xmm1, xmm10");
asm ("pxor xmm2, xmm12");
asm ("pxor xmm3, xmm14");
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
asm ("pxor xmm0, [rdi+0*16]");
asm ("pxor xmm1, [rdi+1*16]");
asm ("pxor xmm2, [rdi+2*16]");
asm ("pxor xmm3, [rdi+3*16]");
/* store CV */
asm ("movaps [rdi+0*16], xmm0");
asm ("movaps [rdi+1*16], xmm1");
asm ("movaps [rdi+2*16], xmm2");
asm ("movaps [rdi+3*16], xmm3");
Pop_All_Regs();
asm (".att_syntax noprefix");
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm10, [rdi+1*16]");
asm ("movaps xmm12, [rdi+2*16]");
asm ("movaps xmm14, [rdi+3*16]");
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
asm ("pxor xmm8, [rdi+0*16]");
asm ("pxor xmm10, [rdi+1*16]");
asm ("pxor xmm12, [rdi+2*16]");
asm ("pxor xmm14, [rdi+3*16]");
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
/* we only need to return the truncated half of the state */
asm ("movaps [rdi+2*16], xmm9");
asm ("movaps [rdi+3*16], xmm11");
Pop_All_Regs();
asm (".att_syntax noprefix");
return;
}

View File

@@ -0,0 +1,519 @@
/* groestl-asm-avx.h Aug 2011
*
* Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx
* instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include "hash-groestl256.h"
/* global variables */
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16];
__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16];
__attribute__ ((aligned (32))) unsigned char ALL_1B[32];
__attribute__ ((aligned (32))) unsigned char ALL_FF[32];
/* temporary variables */
__attribute__ ((aligned (32))) unsigned char TEMP[6*32];
#define tos(a) #a
#define tostr(a) tos(a)
#define SET_CONSTANTS(){\
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
for(i = 0; i < ROUNDS512; i++)\
{\
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
}\
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
}while(0);
#define Push_All_Regs() do{\
/* not using any...
asm("push rax");\
asm("push rbx");\
asm("push rcx");*/\
}while(0);
#define Pop_All_Regs() do{\
/* not using any...
asm("pop rcx");\
asm("pop rbx");\
asm("pop rax");*/\
}while(0);
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b
* xmm[z] has to be zero */
#define VMUL2(i, j, k, z){\
asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
}/**/
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b
* xmm[z] has to be zero */
#define VMUL2v2(i, j, k, z){\
asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
}/**/
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+3}
x_i = t_i + t_{i+3}
y_i = t_i + t+{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
\
/* t_i = a_i + a_{i+1} */\
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\
\
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\
\
/* spill values y_4, y_5 to memory */\
asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\
asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\
asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\
\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\
\
/* compute x_i = t_i + t_{i+3} */\
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\
\
/*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\
VMUL2(a7, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a0, b0, b1, b2);\
\
/* compute w_i : add y_{i+4} */\
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\
\
/*compute v_i: double w_i */\
VMUL2(a0, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a7, b0, b1, b2);\
\
/* add to y_4 y_5 .. v3, v4, ... */\
asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\
asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\
}/*MixBytes*/
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
/* ShiftBytes + SubBytes (interleaved) */\
asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\
\
asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\
\
asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
\
asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
\
asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\
asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\
asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\
asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\
asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
void INIT256(u64* h)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
asm volatile ("emms");
/* load IV into registers xmm12 - xmm15 */
asm ("vmovaps xmm12, [rdi+0*16]");
asm ("vmovaps xmm13, [rdi+1*16]");
asm ("vmovaps xmm14, [rdi+2*16]");
asm ("vmovaps xmm15, [rdi+3*16]");
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* store transposed IV */
asm ("vmovaps [rdi+0*16], xmm12");
asm ("vmovaps [rdi+1*16], xmm2");
asm ("vmovaps [rdi+2*16], xmm6");
asm ("vmovaps [rdi+3*16], xmm7");
asm volatile ("emms");
asm (".att_syntax noprefix");
}
void TF512(u64* h, u64* m)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
/* message M in rsi */
#ifdef IACA_TRACE
IACA_START;
#endif
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load message into registers xmm12 - xmm15 (Q = message) */
asm ("vmovaps xmm12, [rsi+0*16]");
asm ("vmovaps xmm13, [rsi+1*16]");
asm ("vmovaps xmm14, [rsi+2*16]");
asm ("vmovaps xmm15, [rsi+3*16]");
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* load previous chaining value and xor message to CV to get input of P */
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
asm ("vpxor xmm8, xmm12, [rdi+0*16]");
asm ("vpxor xmm0, xmm2, [rdi+1*16]");
asm ("vpxor xmm4, xmm6, [rdi+2*16]");
asm ("vpxor xmm5, xmm7, [rdi+3*16]");
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
asm ("vpxor xmm0, xmm0, xmm8");
asm ("vpxor xmm1, xmm1, xmm10");
asm ("vpxor xmm2, xmm2, xmm12");
asm ("vpxor xmm3, xmm3, xmm14");
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
asm ("vpxor xmm0, xmm0, [rdi+0*16]");
asm ("vpxor xmm1, xmm1, [rdi+1*16]");
asm ("vpxor xmm2, xmm2, [rdi+2*16]");
asm ("vpxor xmm3, xmm3, [rdi+3*16]");
/* store CV */
asm ("vmovaps [rdi+0*16], xmm0");
asm ("vmovaps [rdi+1*16], xmm1");
asm ("vmovaps [rdi+2*16], xmm2");
asm ("vmovaps [rdi+3*16], xmm3");
Pop_All_Regs();
asm (".att_syntax noprefix");
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
asm ("vmovaps xmm8, [rdi+0*16]");
asm ("vmovaps xmm10, [rdi+1*16]");
asm ("vmovaps xmm12, [rdi+2*16]");
asm ("vmovaps xmm14, [rdi+3*16]");
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
asm ("vpxor xmm8, xmm8, [rdi+0*16]");
asm ("vpxor xmm10, xmm10, [rdi+1*16]");
asm ("vpxor xmm12, xmm12, [rdi+2*16]");
asm ("vpxor xmm14, xmm14, [rdi+3*16]");
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
/* we only need to return the truncated half of the state */
asm ("vmovaps [rdi+2*16], xmm9");
asm ("vmovaps [rdi+3*16], xmm11");
Pop_All_Regs();
asm (".att_syntax noprefix");
return;
}

View File

@@ -0,0 +1,856 @@
/* groestl-asm-vperm.h Aug 2011
*
* Groestl implementation with inline assembly using ssse3 instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* Based on the vperm and aes_ni implementations of the hash function Groestl
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
* Institute of Applied Mathematics, Middle East Technical University, Turkey
*
* This code is placed in the public domain
*/
#include "hash-groestl256.h"
/* global constants */
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
__attribute__ ((aligned (16))) unsigned char ALL_0F[16];
__attribute__ ((aligned (16))) unsigned char ALL_15[16];
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
__attribute__ ((aligned (16))) unsigned char ALL_63[16];
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16];
/* temporary variables */
__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16];
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP[8*16];
#define tos(a) #a
#define tostr(a) tos(a)
#define SET_SHARED_CONSTANTS(){\
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\
((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\
((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\
((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\
((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\
((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\
((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\
((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\
((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\
((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\
((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\
((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\
((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\
((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\
((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\
((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\
((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\
((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\
((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\
((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\
((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\
((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\
((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\
((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\
((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\
((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\
((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\
((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\
/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\
((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\
((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\
((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\
((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\
((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\
}/**/
/* VPERM
* Transform w/o settings c*
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\
asm ("psrld xmm"tostr(t0)", 4");\
asm ("psrld xmm"tostr(t1)", 4");\
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\
asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\
asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\
asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\
asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\
asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\
asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\
}/**/
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
asm ("movaps xmm"tostr(c0)", [ALL_0F]");\
asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\
asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\
}/**/
/* VPERM
* Transform
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Transform State
* inputs:
* a0-a3 = state
* table = transformation table to use
* t* = clobbers
* outputs:
* a0-a3 = transformed state
* */
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Add Constant to State
* inputs:
* a0-a7 = state
* constant = constant to add
* t0 = clobber
* outputs:
* a0-a7 = state + constant
* */
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\
}/**/
/* VPERM
* Set Substitute Core Constants
* */
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
}/**/
/* VPERM
* Substitute Core
* first part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0 = 1 row
* t*, c* = clobbers
* outputs:
* b0a, b0b = inputs for lookup step
* */
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
asm ("psrld xmm"tostr(t0)", 4");\
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\
asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\
asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\
asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\
}/**/
/* VPERM
* Lookup
* second part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0a, a0b = output of Substitution Core
* table = lookup table to use (*1 / *2 / *4)
* t0 = clobber
* outputs:
* b0 = output of sbox + multiplication
* */
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\
asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\
asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\
asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\
}/**/
/* VPERM
* SubBytes and *2 / *4
* this function is derived from:
* Constant-time SSSE3 AES core implementation
* by Mike Hamburg
* and
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0-a7 = state
* t*, c* = clobbers
* outputs:
* a0-a7 = state * 4
* c2 = row0 * 2 -> b0
* c1 = row7 * 2 -> b3
* c0 = row7 * 1 -> b4
* t2 = row4 * 1 -> b7
* TEMP_MUL1 = row(i) * 1
* TEMP_MUL2 = row(i) * 2
*
* call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
/* set Constants */\
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
/* row 1 */\
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
/* --- */\
/* row 2 */\
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
/* --- */\
/* row 3 */\
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
/* --- */\
/* row 5 */\
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
/* --- */\
/* row 6 */\
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
/* --- */\
/* row 7 */\
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
/* --- */\
/* row 4 */\
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
/* --- */\
/* row 0 */\
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
/* --- */\
}/**/
/* Optimized MixBytes
* inputs:
* a0-a7 = (row0-row7) * 4
* b0 = row0 * 2
* b3 = row7 * 2
* b4 = row7 * 1
* b7 = row4 * 1
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
* output: b0-b7
* */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* save one value */\
asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\
/* 1 */\
asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\
asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\
asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\
\
/* 2 */\
asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\
asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\
asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\
\
/* 4 */\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\
/*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\
asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\
asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\
asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\
\
/* 3 */\
asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\
/*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\
asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\
\
/* 5 */\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
/*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\
asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\
asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\
asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\
\
/* 6 */\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
\
/* 7 */\
asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\
asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\
asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
\
/* 8 */\
asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
\
/* 9 */\
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
\
/* 10 */\
asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\
\
/* 11 */\
asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
\
/* 12 */\
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\
asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
\
/* 13 */\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
}/**/
//#if (LENGTH <= 256)
#define SET_CONSTANTS(){\
SET_SHARED_CONSTANTS();\
((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\
((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\
((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\
((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\
((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\
((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\
((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\
((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\
((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\
((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\
((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\
((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\
((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\
((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\
((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\
((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\
for(i = 0; i < ROUNDS512; i++)\
{\
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
}\
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
}/**/
#define Push_All_Regs(){\
/* not using any...
asm("push rax");\
asm("push rbx");\
asm("push rcx");*/\
}/**/
#define Pop_All_Regs(){\
/* not using any...
asm("pop rcx");\
asm("pop rbx");\
asm("pop rax");*/\
}/**/
/* vperm:
* transformation before rounds with ipt
* first round add transformed constant
* middle rounds: add constant XOR 0x15...15
* last round: additionally add 0x15...15 after MB
* transformation after rounds with opt
*/
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant + ShiftBytes (interleaved) */\
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
/* SubBytes + Multiplication by 2 and 4 */\
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}/**/
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
\
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
\
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
\
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
\
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst_CNT2(i, j){\
asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\
asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\
asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\
VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
asm ("pxor xmm0, [ALL_15]");\
asm ("pxor xmm1, [ALL_15]");\
asm ("pxor xmm2, [ALL_15]");\
asm ("pxor xmm3, [ALL_15]");\
asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\
asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\
asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\
asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst(){\
asm ("movaps xmm0, [ROUND_CONST_Lx]");\
VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
asm ("pxor xmm0, [ALL_15]");\
asm ("movaps [ROUND_CONST_Lx], xmm0");\
VPERM_Transform_RoundConst_CNT2(0, 1);\
VPERM_Transform_RoundConst_CNT2(2, 3);\
VPERM_Transform_RoundConst_CNT2(4, 5);\
VPERM_Transform_RoundConst_CNT2(6, 7);\
VPERM_Transform_RoundConst_CNT2(8, 9);\
}/**/
void INIT256(u64* h)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
asm volatile ("emms");
/* transform round constants into VPERM mode */
VPERM_Transform_RoundConst();
/* load IV into registers xmm12 - xmm15 */
asm ("movaps xmm12, [rdi+0*16]");
asm ("movaps xmm13, [rdi+1*16]");
asm ("movaps xmm14, [rdi+2*16]");
asm ("movaps xmm15, [rdi+3*16]");
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* store transposed IV */
asm ("movaps [rdi+0*16], xmm12");
asm ("movaps [rdi+1*16], xmm2");
asm ("movaps [rdi+2*16], xmm6");
asm ("movaps [rdi+3*16], xmm7");
asm volatile ("emms");
asm (".att_syntax noprefix");
}
void TF512(u64* h, u64* m)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
/* message M in rsi */
#ifdef IACA_TRACE
IACA_START;
#endif
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load message into registers xmm12 - xmm15 (Q = message) */
asm ("movaps xmm12, [rsi+0*16]");
asm ("movaps xmm13, [rsi+1*16]");
asm ("movaps xmm14, [rsi+2*16]");
asm ("movaps xmm15, [rsi+3*16]");
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm0, [rdi+1*16]");
asm ("movaps xmm4, [rdi+2*16]");
asm ("movaps xmm5, [rdi+3*16]");
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
asm ("pxor xmm8, xmm12");
asm ("pxor xmm0, xmm2");
asm ("pxor xmm4, xmm6");
asm ("pxor xmm5, xmm7");
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
asm ("pxor xmm0, xmm8");
asm ("pxor xmm1, xmm10");
asm ("pxor xmm2, xmm12");
asm ("pxor xmm3, xmm14");
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
asm ("pxor xmm0, [rdi+0*16]");
asm ("pxor xmm1, [rdi+1*16]");
asm ("pxor xmm2, [rdi+2*16]");
asm ("pxor xmm3, [rdi+3*16]");
/* store CV */
asm ("movaps [rdi+0*16], xmm0");
asm ("movaps [rdi+1*16], xmm1");
asm ("movaps [rdi+2*16], xmm2");
asm ("movaps [rdi+3*16], xmm3");
Pop_All_Regs();
asm (".att_syntax noprefix");
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm10, [rdi+1*16]");
asm ("movaps xmm12, [rdi+2*16]");
asm ("movaps xmm14, [rdi+3*16]");
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
asm ("pxor xmm8, [rdi+0*16]");
asm ("pxor xmm10, [rdi+1*16]");
asm ("pxor xmm12, [rdi+2*16]");
asm ("pxor xmm14, [rdi+3*16]");
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7);
/* we only need to return the truncated half of the state */
asm ("movaps [rdi+2*16], xmm9");
asm ("movaps [rdi+3*16], xmm11");
Pop_All_Regs();
asm (".att_syntax noprefix");
return;
}

View File

@@ -0,0 +1,496 @@
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
* instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include <smmintrin.h>
#include <wmmintrin.h>
#include "hash-groestl256.h"
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm_xor_si128(j, j);\
j = _mm_cmpgt_epi8(j, i);\
i = _mm_add_epi8(i, i);\
j = _mm_and_si128(j, k);\
i = _mm_xor_si128(i, j);\
}
/**/
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+3}
x_i = t_i + t_{i+3}
y_i = t_i + t+{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
a2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm_xor_si128(b0, a4);\
b6 = _mm_xor_si128(b6, a4);\
b1 = _mm_xor_si128(b1, a5);\
b7 = _mm_xor_si128(b7, a5);\
b2 = _mm_xor_si128(b2, a6);\
b0 = _mm_xor_si128(b0, a6);\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm_xor_si128(b3, a7);\
b1 = _mm_xor_si128(b1, a7);\
TEMP1 = b1;\
b4 = _mm_xor_si128(b4, a0);\
b2 = _mm_xor_si128(b2, a0);\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm_xor_si128(b5, a1);\
b3 = _mm_xor_si128(b3, a1);\
b1 = a1;\
b6 = _mm_xor_si128(b6, a2);\
b4 = _mm_xor_si128(b4, a2);\
TEMP2 = a2;\
b7 = _mm_xor_si128(b7, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(a2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = ALL_1B;\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#define SET_CONSTANTS(){\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0); \
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm_unpacklo_epi16(i0, i1);\
o1 = _mm_unpackhi_epi16(o1, i1);\
i2 = _mm_unpacklo_epi16(i2, i3);\
t0 = _mm_unpackhi_epi16(t0, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
o2 = _mm_unpackhi_epi32(o2, i2);\
o3 = _mm_unpackhi_epi32(o3, t0);\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm_unpacklo_epi64(i0, i4);\
o1 = _mm_unpackhi_epi64(o1, i4);\
o3 = i1;\
o4 = i2;\
o2 = _mm_unpacklo_epi64(o2, i5);\
o3 = _mm_unpackhi_epi64(o3, i5);\
o5 = i2;\
o6 = i3;\
o4 = _mm_unpacklo_epi64(o4, i6);\
o5 = _mm_unpackhi_epi64(o5, i6);\
o7 = i3;\
o6 = _mm_unpacklo_epi64(o6, i7);\
o7 = _mm_unpackhi_epi64(o7, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o0 = _mm_unpackhi_epi64(o0, i1);\
o1 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
o1 = _mm_unpackhi_epi64(o1, i3);\
o2 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
o2 = _mm_unpackhi_epi64(o2, i5);\
o3 = i6;\
i6 = _mm_unpacklo_epi64(i6, i7);\
o3 = _mm_unpackhi_epi64(o3, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm_unpacklo_epi64(i0, t0);\
i1 = _mm_unpackhi_epi64(i1, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i3 = _mm_unpackhi_epi64(i3, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i5 = _mm_unpackhi_epi64(i5, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
i7 = _mm_unpackhi_epi64(i7, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
void INIT256(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
xmm8 = chaining[0];
xmm0 = chaining[1];
xmm4 = chaining[2];
xmm5 = chaining[3];
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm8, xmm12);
xmm0 = _mm_xor_si128(xmm0, xmm2);
xmm4 = _mm_xor_si128(xmm4, xmm6);
xmm5 = _mm_xor_si128(xmm5, xmm7);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
}

View File

@@ -0,0 +1,482 @@
/* groestl-intr-avx.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx
* instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include <smmintrin.h>
#include <wmmintrin.h>
#include <immintrin.h>
#include "hash-groestl256.h"
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_FF;
//#if LENGTH <= 256
__m128i ALL_1B;
//#else
//__m256d ALL_1B;
//#endif
#define tos(a) #a
#define tostr(a) tos(a)
#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos)))
#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos))
#define SET_CONSTANTS(){\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0);
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b
* xmm[z] has to be zero */
#define VMUL2(i, j, k, z){\
j = _mm_cmpgt_epi8(z, i);\
i = _mm_add_epi8(i, i);\
j = _mm_and_si128(j, k);\
i = _mm_xor_si128(i, j);\
}/**/
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+3}
x_i = t_i + t_{i+3}
y_i = t_i + t+{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
b0 = a2;\
b1 = a3;\
b2 = a4;\
b3 = a5;\
b4 = a6;\
b5 = a7;\
b6 = a0;\
b7 = a1;\
\
/* t_i = a_i + a_{i+1} */\
a0 = _mm_xor_si128(a0, a1);\
a1 = _mm_xor_si128(a1, a2);\
a2 = _mm_xor_si128(a2, a3);\
a3 = _mm_xor_si128(a3, a4);\
a4 = _mm_xor_si128(a4, a5);\
a5 = _mm_xor_si128(a5, a6);\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm_xor_si128(b0, a4);\
b1 = _mm_xor_si128(b1, a5);\
b2 = _mm_xor_si128(b2, a6);\
b3 = _mm_xor_si128(b3, a7);\
b4 = _mm_xor_si128(b4, a0);\
b5 = _mm_xor_si128(b5, a1);\
b6 = _mm_xor_si128(b6, a2);\
b7 = _mm_xor_si128(b7, a3);\
\
b0 = _mm_xor_si128(b0, a6);\
b1 = _mm_xor_si128(b1, a7);\
b2 = _mm_xor_si128(b2, a0);\
b3 = _mm_xor_si128(b3, a1);\
b4 = _mm_xor_si128(b4, a2);\
b5 = _mm_xor_si128(b5, a3);\
b6 = _mm_xor_si128(b6, a4);\
b7 = _mm_xor_si128(b7, a5);\
\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
TEMP1 = b1;\
TEMP2 = b2;\
\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b1 = a1;\
TEMP3 = a2;\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(a2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP3);\
\
/*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
b1 = ALL_1B;\
b2 = _mm_xor_si128(b2, b2);\
VMUL2(a7, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a0, b0, b1, b2);\
\
/* compute w_i : add y_{i+4} */\
a0 = _mm_xor_si128(a0, TEMP0);\
a1 = _mm_xor_si128(a1, TEMP1);\
a2 = _mm_xor_si128(a2, TEMP2);\
a3 = _mm_xor_si128(a3, b3);\
a4 = _mm_xor_si128(a4, b4);\
a5 = _mm_xor_si128(a5, b5);\
a6 = _mm_xor_si128(a6, b6);\
a7 = _mm_xor_si128(a7, b7);\
\
/*compute v_i: double w_i */\
VMUL2(a0, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a7, b0, b1, b2);\
\
/* add to y_4 y_5 .. v3, v4, ... */\
b0 = _mm_xor_si128(a3, TEMP0);\
b1 = _mm_xor_si128(a4, TEMP1);\
b2 = _mm_xor_si128(a5, TEMP2);\
b3 = _mm_xor_si128(b3, a6);\
b4 = _mm_xor_si128(b4, a7);\
b5 = _mm_xor_si128(b5, a0);\
b6 = _mm_xor_si128(b6, a1);\
b7 = _mm_xor_si128(b7, a2);\
}/*MixBytes*/
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* Add Round Constant */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = _mm_unpackhi_epi16(i0, i1);\
i0 = _mm_unpacklo_epi16(i0, i1);\
t0 = _mm_unpackhi_epi16(i2, i3);\
i2 = _mm_unpacklo_epi16(i2, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = _mm_unpackhi_epi32(i0, i2);\
o3 = _mm_unpackhi_epi32(o1, t0);\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = _mm_unpackhi_epi64(i0, i4);\
i0 = _mm_unpacklo_epi64(i0, i4);\
o2 = _mm_unpacklo_epi64(i1, i5);\
o3 = _mm_unpackhi_epi64(i1, i5);\
o4 = _mm_unpacklo_epi64(i2, i6);\
o5 = _mm_unpackhi_epi64(i2, i6);\
o6 = _mm_unpacklo_epi64(i3, i7);\
o7 = _mm_unpackhi_epi64(i3, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = _mm_unpackhi_epi64(i0, i1);\
i0 = _mm_unpacklo_epi64(i0, i1);\
o1 = _mm_unpackhi_epi64(i2, i3);\
i2 = _mm_unpacklo_epi64(i2, i3);\
o2 = _mm_unpackhi_epi64(i4, i5);\
i4 = _mm_unpacklo_epi64(i4, i5);\
o3 = _mm_unpackhi_epi64(i6, i7);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = _mm_unpackhi_epi64(i0, t0);\
i0 = _mm_unpacklo_epi64(i0, t0);\
i3 = _mm_unpackhi_epi64(i2, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i5 = _mm_unpackhi_epi64(i4, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i7 = _mm_unpackhi_epi64(i6, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
void INIT256(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
static __m128i TEMP3;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value and xor message to CV to get input of P */
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm12, chaining[0]);
xmm0 = _mm_xor_si128(xmm2, chaining[1]);
xmm4 = _mm_xor_si128(xmm6, chaining[2]);
xmm5 = _mm_xor_si128(xmm7, chaining[3]);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, chaining[0]);
xmm1 = _mm_xor_si128(xmm1, chaining[1]);
xmm2 = _mm_xor_si128(xmm2, chaining[2]);
xmm3 = _mm_xor_si128(xmm3, chaining[3]);
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
static __m128i TEMP3;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
}

View File

@@ -0,0 +1,793 @@
/* groestl-intr-vperm.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3 instructions.
* Author: Günther A. Roland, Martin Schläffer
*
* Based on the vperm and aes_ni implementations of the hash function Groestl
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
* Institute of Applied Mathematics, Middle East Technical University, Turkey
*
* This code is placed in the public domain
*/
#include <tmmintrin.h>
#include "hash-groestl256.h"
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_0F;
__m128i ALL_15;
__m128i ALL_1B;
__m128i ALL_63;
__m128i ALL_FF;
__m128i VPERM_IPT[2];
__m128i VPERM_OPT[2];
__m128i VPERM_INV[2];
__m128i VPERM_SB1[2];
__m128i VPERM_SB2[2];
__m128i VPERM_SB4[2];
__m128i VPERM_SBO[2];
#define tos(a) #a
#define tostr(a) tos(a)
#define SET_SHARED_CONSTANTS(){\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\
ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\
VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\
VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\
VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\
VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\
VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\
VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\
VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\
VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\
VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\
VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\
VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\
VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\
}/**/
/* VPERM
* Transform w/o settings c*
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
t0 = c0;\
t1 = c0;\
t0 = _mm_andnot_si128(t0, a0);\
t1 = _mm_andnot_si128(t1, a1);\
t0 = _mm_srli_epi32(t0, 4);\
t1 = _mm_srli_epi32(t1, 4);\
a0 = _mm_and_si128(a0, c0);\
a1 = _mm_and_si128(a1, c0);\
t2 = c2;\
t3 = c2;\
t2 = _mm_shuffle_epi8(t2, a0);\
t3 = _mm_shuffle_epi8(t3, a1);\
a0 = c1;\
a1 = c1;\
a0 = _mm_shuffle_epi8(a0, t0);\
a1 = _mm_shuffle_epi8(a1, t1);\
a0 = _mm_xor_si128(a0, t2);\
a1 = _mm_xor_si128(a1, t3);\
}/**/
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
c0 = ALL_0F;\
c1 = ((__m128i*) table )[0];\
c2 = ((__m128i*) table )[1];\
}/**/
/* VPERM
* Transform
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Transform State
* inputs:
* a0-a3 = state
* table = transformation table to use
* t* = clobbers
* outputs:
* a0-a3 = transformed state
* */
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Add Constant to State
* inputs:
* a0-a7 = state
* constant = constant to add
* t0 = clobber
* outputs:
* a0-a7 = state + constant
* */
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
t0 = constant;\
a0 = _mm_xor_si128(a0, t0);\
a1 = _mm_xor_si128(a1, t0);\
a2 = _mm_xor_si128(a2, t0);\
a3 = _mm_xor_si128(a3, t0);\
a4 = _mm_xor_si128(a4, t0);\
a5 = _mm_xor_si128(a5, t0);\
a6 = _mm_xor_si128(a6, t0);\
a7 = _mm_xor_si128(a7, t0);\
}/**/
/* VPERM
* Set Substitute Core Constants
* */
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
}/**/
/* VPERM
* Substitute Core
* first part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0 = 1 row
* t*, c* = clobbers
* outputs:
* b0a, b0b = inputs for lookup step
* */
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
t0 = c0;\
t0 = _mm_andnot_si128(t0, a0);\
t0 = _mm_srli_epi32(t0, 4);\
a0 = _mm_and_si128(a0, c0);\
b0a = c1;\
b0a = _mm_shuffle_epi8(b0a, a0);\
a0 = _mm_xor_si128(a0, t0);\
b0b = c2;\
b0b = _mm_shuffle_epi8(b0b, t0);\
b0b = _mm_xor_si128(b0b, b0a);\
t1 = c2;\
t1 = _mm_shuffle_epi8(t1, a0);\
t1 = _mm_xor_si128(t1, b0a);\
b0a = c2;\
b0a = _mm_shuffle_epi8(b0a, b0b);\
b0a = _mm_xor_si128(b0a, a0);\
b0b = c2;\
b0b = _mm_shuffle_epi8(b0b, t1);\
b0b = _mm_xor_si128(b0b, t0);\
}/**/
/* VPERM
* Lookup
* second part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0a, a0b = output of Substitution Core
* table = lookup table to use (*1 / *2 / *4)
* t0 = clobber
* outputs:
* b0 = output of sbox + multiplication
* */
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
b0 = ((__m128i*) table )[0];\
t0 = ((__m128i*) table )[1];\
b0 = _mm_shuffle_epi8(b0, a0b);\
t0 = _mm_shuffle_epi8(t0, a0a);\
b0 = _mm_xor_si128(b0, t0);\
}/**/
/* VPERM
* SubBytes and *2 / *4
* this function is derived from:
* Constant-time SSSE3 AES core implementation
* by Mike Hamburg
* and
* vperm and aes_ni implementations of hash function Grostl
* by Cagdas CALIK
* inputs:
* a0-a7 = state
* t*, c* = clobbers
* outputs:
* a0-a7 = state * 4
* c2 = row0 * 2 -> b0
* c1 = row7 * 2 -> b3
* c0 = row7 * 1 -> b4
* t2 = row4 * 1 -> b7
* TEMP_MUL1 = row(i) * 1
* TEMP_MUL2 = row(i) * 2
*
* call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
/* set Constants */\
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
/* row 1 */\
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[1] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[1] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
/* --- */\
/* row 2 */\
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[2] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[2] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
/* --- */\
/* row 3 */\
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[3] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[3] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
/* --- */\
/* row 5 */\
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[5] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[5] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
/* --- */\
/* row 6 */\
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[6] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[6] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
/* --- */\
/* row 7 */\
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[7] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
/* --- */\
/* row 4 */\
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[4] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
/* --- */\
/* row 0 */\
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
TEMP_MUL2[0] = c2;\
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
/* --- */\
}/**/
/* Optimized MixBytes
* inputs:
* a0-a7 = (row0-row7) * 4
* b0 = row0 * 2
* b3 = row7 * 2
* b4 = row7 * 1
* b7 = row4 * 1
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
* output: b0-b7
* */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* save one value */\
TEMP_MUL4 = a3;\
/* 1 */\
b1 = a0;\
b1 = _mm_xor_si128(b1, a5);\
b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
b2 = b1;\
\
/* 2 */\
b5 = a1;\
b5 = _mm_xor_si128(b5, a4);\
b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
b6 = b5;\
\
/* 4 */\
b7 = _mm_xor_si128(b7, a6);\
/*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
b2 = _mm_xor_si128(b2, b7);\
\
/* 3 */\
b0 = _mm_xor_si128(b0, a7);\
b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
/*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
b3 = b0;\
b1 = _mm_xor_si128(b1, b0);\
b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
\
/* 5 */\
b4 = _mm_xor_si128(b4, a2);\
/*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
b3 = _mm_xor_si128(b3, b4);\
b6 = _mm_xor_si128(b6, b4);\
\
/* 6 */\
a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
b4 = _mm_xor_si128(b4, a3);\
b5 = _mm_xor_si128(b5, a3);\
b7 = _mm_xor_si128(b7, a3);\
\
/* 7 */\
a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
b2 = _mm_xor_si128(b2, a1);\
b3 = _mm_xor_si128(b3, a1);\
\
/* 8 */\
a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
b6 = _mm_xor_si128(b6, a5);\
b7 = _mm_xor_si128(b7, a5);\
\
/* 9 */\
a3 = TEMP_MUL1[2];\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
b0 = _mm_xor_si128(b0, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* 10 */\
a1 = TEMP_MUL1[6];\
a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
b1 = _mm_xor_si128(b1, a1);\
b4 = _mm_xor_si128(b4, a1);\
\
/* 11 */\
a5 = TEMP_MUL1[3];\
a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
b1 = _mm_xor_si128(b1, a5);\
b6 = _mm_xor_si128(b6, a5);\
\
/* 12 */\
a3 = TEMP_MUL1[7];\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
b2 = _mm_xor_si128(b2, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* 13 */\
b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
b0 = _mm_xor_si128(b0, a4);\
b1 = _mm_xor_si128(b1, a4);\
b3 = _mm_xor_si128(b3, a6);\
b4 = _mm_xor_si128(b4, a0);\
b4 = _mm_xor_si128(b4, a7);\
b5 = _mm_xor_si128(b5, a0);\
b7 = _mm_xor_si128(b7, a2);\
}/**/
#define SET_CONSTANTS(){\
SET_SHARED_CONSTANTS();\
SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\
SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\
SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\
SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\
SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\
SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\
SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\
SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}/**/
/* vperm:
* transformation before rounds with ipt
* first round add transformed constant
* middle rounds: add constant XOR 0x15...15
* last round: additionally add 0x15...15 after MB
* transformation after rounds with opt
*/
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant + ShiftBytes (interleaved) */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a4 = _mm_xor_si128(a4, b1);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
/* SubBytes + Multiplication by 2 and 4 */\
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}/**/
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm_unpacklo_epi16(i0, i1);\
o1 = _mm_unpackhi_epi16(o1, i1);\
i2 = _mm_unpacklo_epi16(i2, i3);\
t0 = _mm_unpackhi_epi16(t0, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
o2 = _mm_unpackhi_epi32(o2, i2);\
o3 = _mm_unpackhi_epi32(o3, t0);\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm_unpacklo_epi64(i0, i4);\
o1 = _mm_unpackhi_epi64(o1, i4);\
o3 = i1;\
o4 = i2;\
o2 = _mm_unpacklo_epi64(o2, i5);\
o3 = _mm_unpackhi_epi64(o3, i5);\
o5 = i2;\
o6 = i3;\
o4 = _mm_unpacklo_epi64(o4, i6);\
o5 = _mm_unpackhi_epi64(o5, i6);\
o7 = i3;\
o6 = _mm_unpacklo_epi64(o6, i7);\
o7 = _mm_unpackhi_epi64(o7, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o0 = _mm_unpackhi_epi64(o0, i1);\
o1 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
o1 = _mm_unpackhi_epi64(o1, i3);\
o2 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
o2 = _mm_unpackhi_epi64(o2, i5);\
o3 = i6;\
i6 = _mm_unpacklo_epi64(i6, i7);\
o3 = _mm_unpackhi_epi64(o3, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm_unpacklo_epi64(i0, t0);\
i1 = _mm_unpackhi_epi64(i1, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i3 = _mm_unpackhi_epi64(i3, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i5 = _mm_unpackhi_epi64(i5, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
i7 = _mm_unpackhi_epi64(i7, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst_CNT2(i, j){\
xmm0 = ROUND_CONST_L0[i];\
xmm1 = ROUND_CONST_L7[i];\
xmm2 = ROUND_CONST_L0[j];\
xmm3 = ROUND_CONST_L7[j];\
VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
xmm1 = _mm_xor_si128(xmm1, (ALL_15));\
xmm2 = _mm_xor_si128(xmm2, (ALL_15));\
xmm3 = _mm_xor_si128(xmm3, (ALL_15));\
ROUND_CONST_L0[i] = xmm0;\
ROUND_CONST_L7[i] = xmm1;\
ROUND_CONST_L0[j] = xmm2;\
ROUND_CONST_L7[j] = xmm3;\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst(){\
xmm0 = ROUND_CONST_Lx;\
VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
ROUND_CONST_Lx = xmm0;\
VPERM_Transform_RoundConst_CNT2(0, 1);\
VPERM_Transform_RoundConst_CNT2(2, 3);\
VPERM_Transform_RoundConst_CNT2(4, 5);\
VPERM_Transform_RoundConst_CNT2(6, 7);\
VPERM_Transform_RoundConst_CNT2(8, 9);\
}/**/
void INIT256(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* transform round constants into VPERM mode */
VPERM_Transform_RoundConst();
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP_MUL1[8];
static __m128i TEMP_MUL2[8];
static __m128i TEMP_MUL4;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
xmm8 = chaining[0];
xmm0 = chaining[1];
xmm4 = chaining[2];
xmm5 = chaining[3];
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm8, xmm12);
xmm0 = _mm_xor_si128(xmm0, xmm2);
xmm4 = _mm_xor_si128(xmm4, xmm6);
xmm5 = _mm_xor_si128(xmm5, xmm7);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP_MUL1[8];
static __m128i TEMP_MUL2[8];
static __m128i TEMP_MUL4;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
return;
}//OF512()

View File

@@ -0,0 +1,306 @@
/* hash.c Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#include "hash-groestl.h"
#include "miner.h"
#ifndef NO_AES_NI
#include "groestl-version.h"
#ifdef TASM
#ifdef VAES
#include "groestl-asm-aes.h"
#else
#ifdef VAVX
#include "groestl-asm-avx.h"
#else
#ifdef VVPERM
#include "groestl-asm-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
#endif
#endif
#endif
#else
#ifdef TINTR
#ifdef VAES
#include "groestl-intr-aes.h"
#else
#ifdef VAVX
#include "groestl-intr-avx.h"
#else
#ifdef VVPERM
#include "groestl-intr-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
#endif
#endif
#endif
#else
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
#endif
#endif
/* digest up to len bytes of input (full blocks only) */
void Transform(hashState_groestl *ctx,
const u8 *in,
unsigned long long len) {
/* increment block counter */
ctx->block_counter += len/SIZE;
/* digest message, one block at a time */
for (; len >= SIZE; len -= SIZE, in += SIZE)
#if LENGTH<=256
TF512((u64*)ctx->chaining, (u64*)in);
#else
TF1024((u64*)ctx->chaining, (u64*)in);
#endif
asm volatile ("emms");
}
/* given state h, do h <- P(h)+h */
void OutputTransformation(hashState_groestl *ctx) {
/* determine variant */
#if (LENGTH <= 256)
OF512((u64*)ctx->chaining);
#else
OF1024((u64*)ctx->chaining);
#endif
asm volatile ("emms");
}
/* initialise context */
HashReturn_gr init_groestl(hashState_groestl* ctx) {
u8 i = 0;
/* output size (in bits) must be a positive integer less than or
equal to 512, and divisible by 8 */
if (LENGTH <= 0 || (LENGTH%8) || LENGTH > 512)
return BAD_HASHBITLEN_GR;
/* set number of state columns and state size depending on
variant */
ctx->columns = COLS;
ctx->statesize = SIZE;
#if (LENGTH <= 256)
ctx->v = SHoRT;
#else
ctx->v = LoNG;
#endif
SET_CONSTANTS();
for (i=0; i<SIZE/8; i++)
ctx->chaining[i] = 0;
for (i=0; i<SIZE; i++)
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH);
INIT(ctx->chaining);
/* set other variables */
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->bits_in_last_byte = 0;
return SUCCESS_GR;
}
HashReturn_gr reinit_groestl(hashState_groestl* ctx)
{
int i;
for (i=0; i<SIZE/8; i++)
ctx->chaining[i] = 0;
for (i=0; i<SIZE; i++)
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH);
INIT(ctx->chaining);
/* set other variables */
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->bits_in_last_byte = 0;
return SUCCESS_GR;
}
/* update state with databitlen bits of input */
HashReturn_gr update_groestl(hashState_groestl* ctx,
const BitSequence_gr* input,
DataLength_gr databitlen) {
int index = 0;
int msglen = (int)(databitlen/8);
int rem = (int)(databitlen%8);
/* non-integral number of message bytes can only be supplied in the
last call to this function */
if (ctx->bits_in_last_byte) return FAIL_GR;
/* if the buffer contains data that has not yet been digested, first
add data to buffer until full */
// The following block of code never gets hit when hashing x11 or quark
// leave it here in case it might be needed.
// if (ctx->buf_ptr)
// {
// while (ctx->buf_ptr < ctx->statesize && index < msglen)
// {
// ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
// }
// if (ctx->buf_ptr < ctx->statesize)
// {
// /* buffer still not full, return */
// if (rem)
// {
// ctx->bits_in_last_byte = rem;
// ctx->buffer[(int)ctx->buf_ptr++] = input[index];
// }
// return SUCCESS_GR;
// }
// /* digest buffer */
// ctx->buf_ptr = 0;
// printf("error\n");
// Transform(ctx, ctx->buffer, ctx->statesize);
// end dead code
// }
/* digest bulk of message */
Transform(ctx, input+index, msglen-index);
index += ((msglen-index)/ctx->statesize)*ctx->statesize;
/* store remaining data in buffer */
while (index < msglen)
{
ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
}
// Another block that doesn't get used by x11 or quark
// /* if non-integral number of bytes have been supplied, store
// remaining bits in last byte, together with information about
// number of bits */
// if (rem)
// {
// ctx->bits_in_last_byte = rem;
// ctx->buffer[(int)ctx->buf_ptr++] = input[index];
// }
return SUCCESS_GR;
}
#define BILB ctx->bits_in_last_byte
/* finalise: process remaining data (including padding), perform
output transformation, and write hash result to 'output' */
HashReturn_gr final_groestl(hashState_groestl* ctx,
BitSequence_gr* output) {
int i, j = 0, hashbytelen = LENGTH/8;
u8 *s = (BitSequence_gr*)ctx->chaining;
/* pad with '1'-bit and first few '0'-bits */
if (BILB) {
ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
BILB = 0;
}
else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
/* pad with '0'-bits */
if (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
/* padding requires two blocks */
while (ctx->buf_ptr < ctx->statesize) {
ctx->buffer[(int)ctx->buf_ptr++] = 0;
}
/* digest first padding block */
Transform(ctx, ctx->buffer, ctx->statesize);
ctx->buf_ptr = 0;
}
while (ctx->buf_ptr < ctx->statesize-LENGTHFIELDLEN) {
ctx->buffer[(int)ctx->buf_ptr++] = 0;
}
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = ctx->statesize;
while (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
/* digest final padding block */
Transform(ctx, ctx->buffer, ctx->statesize);
/* perform output transformation */
OutputTransformation(ctx);
/* store hash result in output */
for (i = ctx->statesize-hashbytelen; i < ctx->statesize; i++,j++) {
output[j] = s[i];
}
/* zeroise relevant variables and deallocate memory */
for (i = 0; i < ctx->columns; i++) {
ctx->chaining[i] = 0;
}
for (i = 0; i < ctx->statesize; i++) {
ctx->buffer[i] = 0;
}
// free(ctx->chaining);
// free(ctx->buffer);
return SUCCESS_GR;
}
/* hash bit sequence */
HashReturn_gr hash_groestl(int hashbitlen,
const BitSequence_gr* data,
DataLength_gr databitlen,
BitSequence_gr* hashval) {
HashReturn_gr ret;
hashState_groestl context;
/* initialise */
if ((ret = init_groestl(&context)) != SUCCESS_GR)
return ret;
/* process message */
if ((ret = update_groestl(&context, data, databitlen)) != SUCCESS_GR)
return ret;
/* finalise */
ret = final_groestl(&context, hashval);
return ret;
}
/* eBash API */
#ifdef crypto_hash_BYTES
int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
{
if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0;
return -1;
}
#endif
#endif

View File

@@ -0,0 +1,110 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#ifndef __hash_h
#define __hash_h
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>
/* eBash API begin */
/*
#include "crypto_hash.h"
#ifdef crypto_hash_BYTES
#include <crypto_uint8.h>
#include <crypto_uint32.h>
#include <crypto_uint64.h>
typedef crypto_uint8 u8;
typedef crypto_uint32 u32;
typedef crypto_uint64 u64;
#endif
* /
/* eBash API end */
#define LENGTH (512)
#include "brg_endian.h"
#define NEED_UINT_64T
#include "brg_types.h"
#ifdef IACA_TRACE
#include IACA_MARKS
#endif
#ifndef LENGTH
#define LENGTH (256)
#endif
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
#define COLS1024 (16)
#define SIZE512 ((ROWS)*(COLS512))
#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
#define ROUNDS1024 (14)
#if LENGTH<=256
#define COLS (COLS512)
#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
#else
#define COLS (COLS1024)
#define SIZE (SIZE1024)
#define ROUNDS (ROUNDS1024)
#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
#define U64BIG(a) \
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
(ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
typedef enum { LoNG, SHoRT } Var;
/* NIST API begin */
typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
typedef struct {
__attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
__attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
u64 block_counter; /* message block counter */
int buf_ptr; /* data buffer pointer */
int bits_in_last_byte; /* no. of message bits in last byte of
data buffer */
int columns; /* no. of columns in state */
int statesize; /* total no. of bytes in state */
Var v; /* LONG or SHORT */
} hashState_groestl;
HashReturn_gr init_groestl(hashState_groestl*);
HashReturn_gr reinit_groestl(hashState_groestl*);
HashReturn_gr update_groestl(hashState_groestl*, const BitSequence_gr*, DataLength_gr);
HashReturn_gr final_groestl(hashState_groestl*, BitSequence_gr*);
HashReturn_gr hash_groestl(int, const BitSequence_gr*, DataLength_gr, BitSequence_gr*);
/* NIST API end */
#endif /* __hash_h */

View File

@@ -0,0 +1,309 @@
/* hash.c Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#include "hash-groestl256.h"
#include "miner.h"
#ifndef NO_AES_NI
#include "groestl-version.h"
#ifdef TASM
#ifdef VAES
#include "groestl256-asm-aes.h"
#else
#ifdef VAVX
#include "groestl256-asm-avx.h"
#else
#ifdef VVPERM
#include "groestl256-asm-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
#endif
#endif
#endif
#else
#ifdef TINTR
#ifdef VAES
#include "groestl256-intr-aes.h"
#else
#ifdef VAVX
#include "groestl256-intr-avx.h"
#else
#ifdef VVPERM
#include "groestl256-intr-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
#endif
#endif
#endif
#else
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
#endif
#endif
/* digest up to len bytes of input (full blocks only) */
void Transform256(hashState_groestl256 *ctx,
const u8 *in,
unsigned long long len) {
/* increment block counter */
ctx->block_counter += len/SIZE;
/* digest message, one block at a time */
for (; len >= SIZE; len -= SIZE, in += SIZE)
//#if LENGTH<=256
TF512((u64*)ctx->chaining, (u64*)in);
//#else
// TF1024((u64*)ctx->chaining, (u64*)in);
//#endif
asm volatile ("emms");
}
/* given state h, do h <- P(h)+h */
void OutputTransformation256(hashState_groestl256 *ctx) {
/* determine variant */
//#if (LENGTH <= 256)
OF512((u64*)ctx->chaining);
//#else
// OF1024((u64*)ctx->chaining);
//#endif
asm volatile ("emms");
}
/* initialise context */
HashReturn_gr init_groestl256(hashState_groestl256* ctx) {
u8 i = 0;
/* output size (in bits) must be a positive integer less than or
equal to 512, and divisible by 8 */
// if (LENGTH <= 0 || (LENGTH%8) || LENGTH > 512)
// return BAD_HASHBITLEN_GR;
/* set number of state columns and state size depending on
variant */
ctx->columns = COLS;
ctx->statesize = SIZE;
//#if (LENGTH <= 256)
ctx->v = SHoRT;
//#else
// ctx->v = LoNG;
//#endif
SET_CONSTANTS();
for (i=0; i<SIZE/8; i++)
ctx->chaining[i] = 0;
for (i=0; i<SIZE; i++)
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
// ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH);
ctx->chaining[ctx->columns-1] = U64BIG((u64)256);
INIT256(ctx->chaining);
/* set other variables */
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->bits_in_last_byte = 0;
return SUCCESS_GR;
}
HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
{
int i;
for (i=0; i<SIZE/8; i++)
ctx->chaining[i] = 0;
for (i=0; i<SIZE; i++)
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
// ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH);
ctx->chaining[ctx->columns-1] = 256;
INIT256(ctx->chaining);
/* set other variables */
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->bits_in_last_byte = 0;
return SUCCESS_GR;
}
/* update state with databitlen bits of input */
HashReturn_gr update_groestl256(hashState_groestl256* ctx,
const BitSequence_gr* input,
DataLength_gr databitlen) {
int index = 0;
int msglen = (int)(databitlen/8);
int rem = (int)(databitlen%8);
/* non-integral number of message bytes can only be supplied in the
last call to this function */
if (ctx->bits_in_last_byte) return FAIL_GR;
/* if the buffer contains data that has not yet been digested, first
add data to buffer until full */
// The following block of code never gets hit when hashing x11 or quark
// leave it here in case it might be needed.
// if (ctx->buf_ptr)
// {
// while (ctx->buf_ptr < ctx->statesize && index < msglen)
// {
// ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
// }
// if (ctx->buf_ptr < ctx->statesize)
// {
// /* buffer still not full, return */
// if (rem)
// {
// ctx->bits_in_last_byte = rem;
// ctx->buffer[(int)ctx->buf_ptr++] = input[index];
// }
// return SUCCESS_GR;
// }
// /* digest buffer */
// ctx->buf_ptr = 0;
// printf("error\n");
// Transform(ctx, ctx->buffer, ctx->statesize);
// end dead code
// }
/* digest bulk of message */
Transform256(ctx, input+index, msglen-index);
index += ((msglen-index)/ctx->statesize)*ctx->statesize;
/* store remaining data in buffer */
while (index < msglen)
{
ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
}
// Another block that doesn't get used by x11 or quark
// /* if non-integral number of bytes have been supplied, store
// remaining bits in last byte, together with information about
// number of bits */
// if (rem)
// {
// ctx->bits_in_last_byte = rem;
// ctx->buffer[(int)ctx->buf_ptr++] = input[index];
// }
return SUCCESS_GR;
}
#define BILB ctx->bits_in_last_byte
/* finalise: process remaining data (including padding), perform
output transformation, and write hash result to 'output' */
HashReturn_gr final_groestl256(hashState_groestl256* ctx,
BitSequence_gr* output) {
// int i, j = 0, hashbytelen = LENGTH/8;
int i, j = 0, hashbytelen = 256/8;
u8 *s = (BitSequence_gr*)ctx->chaining;
/* pad with '1'-bit and first few '0'-bits */
if (BILB) {
ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
BILB = 0;
}
else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
/* pad with '0'-bits */
if (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
/* padding requires two blocks */
while (ctx->buf_ptr < ctx->statesize) {
ctx->buffer[(int)ctx->buf_ptr++] = 0;
}
/* digest first padding block */
Transform256(ctx, ctx->buffer, ctx->statesize);
ctx->buf_ptr = 0;
}
while (ctx->buf_ptr < ctx->statesize-LENGTHFIELDLEN) {
ctx->buffer[(int)ctx->buf_ptr++] = 0;
}
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = ctx->statesize;
while (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
/* digest final padding block */
Transform256(ctx, ctx->buffer, ctx->statesize);
/* perform output transformation */
OutputTransformation256(ctx);
/* store hash result in output */
for (i = ctx->statesize-hashbytelen; i < ctx->statesize; i++,j++) {
output[j] = s[i];
}
/* zeroise relevant variables and deallocate memory */
for (i = 0; i < ctx->columns; i++) {
ctx->chaining[i] = 0;
}
for (i = 0; i < ctx->statesize; i++) {
ctx->buffer[i] = 0;
}
// free(ctx->chaining);
// free(ctx->buffer);
return SUCCESS_GR;
}
/* hash bit sequence */
HashReturn_gr hash_groestl256(int hashbitlen,
const BitSequence_gr* data,
DataLength_gr databitlen,
BitSequence_gr* hashval) {
HashReturn_gr ret;
hashState_groestl256 context;
/* initialise */
if ((ret = init_groestl256(&context)) != SUCCESS_GR)
return ret;
/* process message */
if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR)
return ret;
/* finalise */
ret = final_groestl256(&context, hashval);
return ret;
}
/* eBash API */
//#ifdef crypto_hash_BYTES
//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
//{
// if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0;
// return -1;
//}
//#endif
#endif

View File

@@ -0,0 +1,110 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#ifndef __hash_h
#define __hash_h
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>
/* eBash API begin */
/*
#include "crypto_hash.h"
#ifdef crypto_hash_BYTES
#include <crypto_uint8.h>
#include <crypto_uint32.h>
#include <crypto_uint64.h>
typedef crypto_uint8 u8;
typedef crypto_uint32 u32;
typedef crypto_uint64 u64;
#endif
*/
/* eBash API end */
//#define LENGTH (512)
#include "brg_endian.h"
#define NEED_UINT_64T
#include "brg_types.h"
#ifdef IACA_TRACE
#include IACA_MARKS
#endif
//#ifndef LENGTH
//#define LENGTH (256)
//#endif
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
#define COLS1024 (16)
#define SIZE512 ((ROWS)*(COLS512))
#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
#define ROUNDS1024 (14)
//#if LENGTH<=256
#define COLS (COLS512)
#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
//#else
//#define COLS (COLS1024)
//#define SIZE (SIZE1024)
//#define ROUNDS (ROUNDS1024)
//#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
#define U64BIG(a) \
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
(ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
typedef enum { LoNG, SHoRT } Var;
/* NIST API begin */
typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
typedef struct {
__attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
__attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
u64 block_counter; /* message block counter */
int buf_ptr; /* data buffer pointer */
int bits_in_last_byte; /* no. of message bits in last byte of
data buffer */
int columns; /* no. of columns in state */
int statesize; /* total no. of bytes in state */
Var v; /* LONG or SHORT */
} hashState_groestl256;
HashReturn_gr init_groestl(hashState_groestl256*);
HashReturn_gr reinit_groestl(hashState_groestl256*);
HashReturn_gr update_groestl(hashState_groestl256*, const BitSequence_gr*, DataLength_gr);
HashReturn_gr final_groestl(hashState_groestl256*, BitSequence_gr*);
HashReturn_gr hash_groestl(int, const BitSequence_gr*, DataLength_gr, BitSequence_gr*);
/* NIST API end */
#endif /* __hash_h */

View File

@@ -0,0 +1,3 @@
Krystian Matusiewicz
Günther A. Roland
Martin Schläffer