Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)
Initial upload v3.4.7
0   algo/groestl/aes_ni/.dirstamp   (new file)
14  algo/groestl/aes_ni/README      (new file)
@@ -0,0 +1,14 @@
This package contains an implementation of the Groestl-512 hash
function optimized for the Intel AES instructions.

Authors are Krystian Matusiewicz, Günther A. Roland, Martin Schläffer

There are no known present or future claims by a copyright holder that
the distribution of this software infringes the copyright. In
particular, the author of the software is not making such claims and
does not intend to make such claims.

Moreover, there are no known present or future claims by a patent
holder that the use of this software infringes the patent. In
particular, the author of the software is not making such claims and
does not intend to make such claims.
2   algo/groestl/aes_ni/api.h   (new file)
@@ -0,0 +1,2 @@
#define CRYPTO_BYTES 64
#define CRYPTO_VERSION "2.2"
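A note on api.h: CRYPTO_BYTES and CRYPTO_VERSION follow the SUPERCOP/eBASH crypto_hash API convention, so CRYPTO_BYTES is the 64-byte Groestl-512 digest length. Below is a minimal caller sketch under that assumption; the crypto_hash() prototype is the generic SUPERCOP signature, not something declared in this commit.

#include <stdio.h>
#include "api.h"   /* CRYPTO_BYTES */

/* assumed SUPERCOP-style entry point, provided elsewhere by the package */
extern int crypto_hash(unsigned char *out, const unsigned char *in,
                       unsigned long long inlen);

int main(void)
{
    unsigned char digest[CRYPTO_BYTES];
    const unsigned char msg[3] = { 'a', 'b', 'c' };
    int i;

    if (crypto_hash(digest, msg, sizeof(msg)) == 0) {
        for (i = 0; i < CRYPTO_BYTES; i++)
            printf("%02x", digest[i]);
        printf("\n");
    }
    return 0;
}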
1   algo/groestl/aes_ni/architectures   (new file)
@@ -0,0 +1 @@
amd64
133   algo/groestl/aes_ni/brg_endian.h   (new file)
@@ -0,0 +1,133 @@
|
||||
/*
|
||||
---------------------------------------------------------------------------
|
||||
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
|
||||
|
||||
LICENSE TERMS
|
||||
|
||||
The redistribution and use of this software (with or without changes)
|
||||
is allowed without the payment of fees or royalties provided that:
|
||||
|
||||
1. source code distributions include the above copyright notice, this
|
||||
list of conditions and the following disclaimer;
|
||||
|
||||
2. binary distributions include the above copyright notice, this list
|
||||
of conditions and the following disclaimer in their documentation;
|
||||
|
||||
3. the name of the copyright holder is not used to endorse products
|
||||
built using this software without specific written permission.
|
||||
|
||||
DISCLAIMER
|
||||
|
||||
This software is provided 'as is' with no explicit or implied warranties
|
||||
in respect of its properties, including, but not limited to, correctness
|
||||
and/or fitness for purpose.
|
||||
---------------------------------------------------------------------------
|
||||
Issue Date: 20/12/2007
|
||||
*/
|
||||
|
||||
#ifndef _BRG_ENDIAN_H
|
||||
#define _BRG_ENDIAN_H
|
||||
|
||||
#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
|
||||
#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
|
||||
|
||||
/* Include files where endian defines and byteswap functions may reside */
|
||||
#if defined( __sun )
|
||||
# include <sys/isa_defs.h>
|
||||
#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
|
||||
# include <sys/endian.h>
|
||||
#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
|
||||
defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
|
||||
# include <machine/endian.h>
|
||||
#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
|
||||
# if !defined( __MINGW32__ ) && !defined( _AIX )
|
||||
# include <endian.h>
|
||||
# if !defined( __BEOS__ )
|
||||
# include <byteswap.h>
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* Now attempt to set the define for platform byte order using any */
|
||||
/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */
|
||||
/* seem to encompass most endian symbol definitions */
|
||||
|
||||
#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
|
||||
# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
|
||||
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
|
||||
# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
|
||||
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
|
||||
# endif
|
||||
#elif defined( BIG_ENDIAN )
|
||||
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
|
||||
#elif defined( LITTLE_ENDIAN )
|
||||
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
|
||||
#endif
|
||||
|
||||
#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
|
||||
# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
|
||||
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
|
||||
# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
|
||||
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
|
||||
# endif
|
||||
#elif defined( _BIG_ENDIAN )
|
||||
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
|
||||
#elif defined( _LITTLE_ENDIAN )
|
||||
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
|
||||
#endif
|
||||
|
||||
#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
|
||||
# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
|
||||
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
|
||||
# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
|
||||
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
|
||||
# endif
|
||||
#elif defined( __BIG_ENDIAN )
|
||||
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
|
||||
#elif defined( __LITTLE_ENDIAN )
|
||||
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
|
||||
#endif
|
||||
|
||||
#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
|
||||
# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
|
||||
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
|
||||
# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
|
||||
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
|
||||
# endif
|
||||
#elif defined( __BIG_ENDIAN__ )
|
||||
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
|
||||
#elif defined( __LITTLE_ENDIAN__ )
|
||||
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
|
||||
#endif
|
||||
|
||||
/* if the platform byte order could not be determined, then try to */
|
||||
/* set this define using common machine defines */
|
||||
#if !defined(PLATFORM_BYTE_ORDER)
|
||||
|
||||
#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \
|
||||
defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \
|
||||
defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \
|
||||
defined( vax ) || defined( vms ) || defined( VMS ) || \
|
||||
defined( __VMS ) || defined( _M_X64 )
|
||||
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
|
||||
|
||||
#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \
|
||||
defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \
|
||||
defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \
|
||||
defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \
|
||||
defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \
|
||||
defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \
|
||||
defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
|
||||
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
|
||||
|
||||
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
|
||||
# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
|
||||
#elif 0 /* **** EDIT HERE IF NECESSARY **** */
|
||||
# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
|
||||
#else
|
||||
# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
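A note on brg_endian.h: consumers branch on PLATFORM_BYTE_ORDER at compile time once the header has settled on a value. A minimal usage sketch; the helper below is illustrative and not part of this commit.

#include <string.h>
#include "brg_endian.h"

/* read a 32-bit value stored little-endian in memory */
static unsigned int load_le32(const unsigned char *p)
{
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    unsigned int v;
    memcpy(&v, p, sizeof v);          /* bytes already match host order */
    return v;
#else
    return  (unsigned int)p[0]        | ((unsigned int)p[1] <<  8) |
           ((unsigned int)p[2] << 16) | ((unsigned int)p[3] << 24);
#endif
}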
234   algo/groestl/aes_ni/brg_types.h   (new file)
@@ -0,0 +1,234 @@
|
||||
/*
|
||||
---------------------------------------------------------------------------
|
||||
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
|
||||
|
||||
(a few lines added by Soeren S. Thomsen, October 2008)
|
||||
|
||||
LICENSE TERMS
|
||||
|
||||
The redistribution and use of this software (with or without changes)
|
||||
is allowed without the payment of fees or royalties provided that:
|
||||
|
||||
1. source code distributions include the above copyright notice, this
|
||||
list of conditions and the following disclaimer;
|
||||
|
||||
2. binary distributions include the above copyright notice, this list
|
||||
of conditions and the following disclaimer in their documentation;
|
||||
|
||||
3. the name of the copyright holder is not used to endorse products
|
||||
built using this software without specific written permission.
|
||||
|
||||
DISCLAIMER
|
||||
|
||||
This software is provided 'as is' with no explicit or implied warranties
|
||||
in respect of its properties, including, but not limited to, correctness
|
||||
and/or fitness for purpose.
|
||||
---------------------------------------------------------------------------
|
||||
Issue Date: 20/12/2007
|
||||
|
||||
The unsigned integer types defined here are of the form uint_<nn>t where
|
||||
<nn> is the length of the type; for example, the unsigned 32-bit type is
|
||||
'uint_32t'. These are NOT the same as the 'C99 integer types' that are
|
||||
defined in the inttypes.h and stdint.h headers since attempts to use these
|
||||
types have shown that support for them is still highly variable. However,
|
||||
since the latter are of the form uint<nn>_t, a regular expression search
|
||||
and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
|
||||
can be used to convert the types used here to the C99 standard types.
|
||||
*/
|
||||
|
||||
#ifndef _BRG_TYPES_H
|
||||
#define _BRG_TYPES_H
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
|
||||
# include <stddef.h>
|
||||
# define ptrint_t intptr_t
|
||||
#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
|
||||
# include <stdint.h>
|
||||
# define ptrint_t intptr_t
|
||||
#else
|
||||
# define ptrint_t int
|
||||
#endif
|
||||
|
||||
#ifndef BRG_UI8
|
||||
# define BRG_UI8
|
||||
# if UCHAR_MAX == 255u
|
||||
typedef unsigned char uint_8t;
|
||||
# else
|
||||
# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifndef BRG_UI16
|
||||
# define BRG_UI16
|
||||
# if USHRT_MAX == 65535u
|
||||
typedef unsigned short uint_16t;
|
||||
# else
|
||||
# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifndef BRG_UI32
|
||||
# define BRG_UI32
|
||||
# if UINT_MAX == 4294967295u
|
||||
# define li_32(h) 0x##h##u
|
||||
typedef unsigned int uint_32t;
|
||||
# elif ULONG_MAX == 4294967295u
|
||||
# define li_32(h) 0x##h##ul
|
||||
typedef unsigned long uint_32t;
|
||||
# elif defined( _CRAY )
|
||||
# error This code needs 32-bit data types, which Cray machines do not provide
|
||||
# else
|
||||
# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifndef BRG_UI64
|
||||
# if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
|
||||
# define BRG_UI64
|
||||
# define li_64(h) 0x##h##ui64
|
||||
typedef unsigned __int64 uint_64t;
|
||||
# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */
|
||||
# define BRG_UI64
|
||||
# define li_64(h) 0x##h##ui64
|
||||
typedef unsigned __int64 uint_64t;
|
||||
# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
|
||||
# define BRG_UI64
|
||||
# define li_64(h) 0x##h##ull
|
||||
typedef unsigned long long uint_64t;
|
||||
# elif defined( __MVS__ )
|
||||
# define BRG_UI64
|
||||
# define li_64(h) 0x##h##ull
|
||||
typedef unsigned int long long uint_64t;
|
||||
# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
|
||||
# if UINT_MAX == 18446744073709551615u
|
||||
# define BRG_UI64
|
||||
# define li_64(h) 0x##h##u
|
||||
typedef unsigned int uint_64t;
|
||||
# endif
|
||||
# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
|
||||
# if ULONG_MAX == 18446744073709551615ul
|
||||
# define BRG_UI64
|
||||
# define li_64(h) 0x##h##ul
|
||||
typedef unsigned long uint_64t;
|
||||
# endif
|
||||
# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
|
||||
# if ULLONG_MAX == 18446744073709551615ull
|
||||
# define BRG_UI64
|
||||
# define li_64(h) 0x##h##ull
|
||||
typedef unsigned long long uint_64t;
|
||||
# endif
|
||||
# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
|
||||
# if ULONG_LONG_MAX == 18446744073709551615ull
|
||||
# define BRG_UI64
|
||||
# define li_64(h) 0x##h##ull
|
||||
typedef unsigned long long uint_64t;
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if !defined( BRG_UI64 )
|
||||
# if defined( NEED_UINT_64T )
|
||||
# define BRG_UI64
|
||||
# define li_64(h) 0x##h##ull
|
||||
typedef unsigned long long uint_64t;
|
||||
/*# error Please define uint_64t as an unsigned 64 bit type in brg_types.h*/
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifndef RETURN_VALUES
|
||||
# define RETURN_VALUES
|
||||
# if defined( DLL_EXPORT )
|
||||
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
|
||||
# define VOID_RETURN __declspec( dllexport ) void __stdcall
|
||||
# define INT_RETURN __declspec( dllexport ) int __stdcall
|
||||
# elif defined( __GNUC__ )
|
||||
# define VOID_RETURN __declspec( __dllexport__ ) void
|
||||
# define INT_RETURN __declspec( __dllexport__ ) int
|
||||
# else
|
||||
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
|
||||
# endif
|
||||
# elif defined( DLL_IMPORT )
|
||||
# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
|
||||
# define VOID_RETURN __declspec( dllimport ) void __stdcall
|
||||
# define INT_RETURN __declspec( dllimport ) int __stdcall
|
||||
# elif defined( __GNUC__ )
|
||||
# define VOID_RETURN __declspec( __dllimport__ ) void
|
||||
# define INT_RETURN __declspec( __dllimport__ ) int
|
||||
# else
|
||||
# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
|
||||
# endif
|
||||
# elif defined( __WATCOMC__ )
|
||||
# define VOID_RETURN void __cdecl
|
||||
# define INT_RETURN int __cdecl
|
||||
# else
|
||||
# define VOID_RETURN void
|
||||
# define INT_RETURN int
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* These defines are used to detect and set the memory alignment of pointers.
|
||||
Note that offsets are in bytes.
|
||||
|
||||
ALIGN_OFFSET(x,n) return the positive or zero offset of
|
||||
the memory addressed by the pointer 'x'
|
||||
from an address that is aligned on an
|
||||
'n' byte boundary ('n' is a power of 2)
|
||||
|
||||
ALIGN_FLOOR(x,n) return a pointer that points to memory
|
||||
that is aligned on an 'n' byte boundary
|
||||
and is not higher than the memory address
|
||||
pointed to by 'x' ('n' is a power of 2)
|
||||
|
||||
ALIGN_CEIL(x,n) return a pointer that points to memory
|
||||
that is aligned on an 'n' byte boundary
|
||||
and is not lower than the memory address
|
||||
pointed to by 'x' ('n' is a power of 2)
|
||||
*/
|
||||
|
||||
#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1))
|
||||
#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1)))
|
||||
#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1)))
|
||||
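/* Illustrative usage sketch (editorial addition, not part of the original
   header): round a scratch pointer up to the next 16-byte boundary and
   measure how far the raw pointer sits past the previous one. */
static uint_8t *align16_sketch(uint_8t *raw)
{
    ptrint_t off = ALIGN_OFFSET(raw, 16);   /* 0..15 bytes past a boundary   */
    (void)off;
    return ALIGN_CEIL(raw, 16);             /* first 16-byte boundary >= raw */
}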
|
||||
/* These defines are used to declare buffers in a way that allows
|
||||
faster operations on longer variables to be used. In all these
|
||||
defines 'size' must be a power of 2 and >= 8. NOTE that the
|
||||
buffer size is in bytes but the type length is in bits
|
||||
|
||||
UNIT_TYPEDEF(x,size) declares a variable 'x' of length
|
||||
'size' bits
|
||||
|
||||
BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize'
|
||||
bytes defined as an array of variables
|
||||
each of 'size' bits (bsize must be a
|
||||
multiple of size / 8)
|
||||
|
||||
UNIT_CAST(x,size) casts a variable to a type of
|
||||
length 'size' bits
|
||||
|
||||
UPTR_CAST(x,size) casts a pointer to a pointer to a
|
||||
variable of length 'size' bits
|
||||
*/
|
||||
|
||||
#define UI_TYPE(size) uint_##size##t
|
||||
#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x
|
||||
#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)]
|
||||
#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x))
|
||||
#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x))
|
||||
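/* Illustrative usage sketch (editorial addition, not part of the original
   header): declare a 64-byte buffer handled in 32-bit units and read its
   first unit through a unit pointer. */
UNIT_TYPEDEF(example_unit_t, 32);      /* typedef uint_32t example_unit_t    */
BUFR_TYPEDEF(example_buf_t, 32, 64);   /* typedef uint_32t example_buf_t[16] */

static example_unit_t example_first_unit(void *buf)
{
    return *UPTR_CAST(buf, 32);        /* ((uint_32t*)(buf))[0] */
}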
|
||||
/* Added by Soeren S. Thomsen (begin) */
|
||||
#define u8 uint_8t
|
||||
#define u32 uint_32t
|
||||
#define u64 uint_64t
|
||||
/* (end) */
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
1043  algo/groestl/aes_ni/groestl-asm-aes.h    (new file; diff suppressed because it is too large)
1105  algo/groestl/aes_ni/groestl-asm-avx.h    (new file; diff suppressed because it is too large)
1397  algo/groestl/aes_ni/groestl-asm-vperm.h  (new file; diff suppressed because it is too large)
965   algo/groestl/aes_ni/groestl-intr-aes.h   (new file)
@@ -0,0 +1,965 @@
|
||||
/* groestl-intr-aes.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
|
||||
* instructions.
|
||||
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include "hash-groestl.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_FF;
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_xor_si128(j, j);\
|
||||
j = _mm_cmpgt_epi8(j, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
}
|
||||
|
||||
/**/
|
||||
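/* Scalar sketch of MUL2 (editorial addition, not part of the original file):
 * each of the 16 bytes in xmm[i] is doubled in GF(2^8) modulo the AES
 * polynomial, i.e. shifted left one bit and xored with 0x1b when its top bit
 * was set.  The macro above builds the 0x1b mask branch-free with a signed
 * byte compare against zero instead of this explicit subtraction. */
static unsigned char gf256_double_sketch(unsigned char x)
{
    unsigned char mask = (unsigned char)(0u - (x >> 7));  /* 0xff if MSB set */
    return (unsigned char)((x << 1) ^ (mask & 0x1b));
}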
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+1}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t_{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = ALL_1B;\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
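/* Plain-C reference of the formula chain documented above (editorial
 * addition, not part of the original file).  Each a[i]/b[i] is one 64-bit row
 * of a 512-bit state, '+' is XOR and '2*' is byte-wise GF(2^8) doubling.
 * Useful only for checking the register scheduling of the macro, not for
 * speed. */
static u64 mixbytes_double64(u64 x)
{
    u64 hi = x & 0x8080808080808080ULL;                   /* MSB of each byte */
    return ((x & 0x7f7f7f7f7f7f7f7fULL) << 1) ^ ((hi >> 7) * 0x1b);
}

static void mixbytes_reference(const u64 a[8], u64 b[8])
{
    u64 t[8], x[8], y[8], w[8], v[8];
    int i;
    for (i = 0; i < 8; i++) t[i] = a[i] ^ a[(i + 1) & 7];                /* t_i */
    for (i = 0; i < 8; i++) x[i] = t[i] ^ t[(i + 3) & 7];                /* x_i */
    for (i = 0; i < 8; i++) y[i] = t[i] ^ t[(i + 2) & 7] ^ a[(i + 6) & 7];
    for (i = 0; i < 8; i++) w[i] = mixbytes_double64(x[i]) ^ y[(i + 4) & 7];
    for (i = 0; i < 8; i++) v[i] = mixbytes_double64(w[i]);              /* v_i */
    for (i = 0; i < 8; i++) b[i] = v[(i + 3) & 7] ^ y[(i + 4) & 7];      /* b_i */
}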
|
||||
#if (LENGTH <= 256)
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}while(0); \
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
\
|
||||
o1 = i0;\
|
||||
t0 = i2;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi16(o1, i1);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
\
|
||||
o2 = i0;\
|
||||
o3 = o1;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
o2 = _mm_unpackhi_epi32(o2, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o3, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = i0;\
|
||||
o2 = i1;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i4);\
|
||||
o3 = i1;\
|
||||
o4 = i2;\
|
||||
o2 = _mm_unpacklo_epi64(o2, i5);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i5);\
|
||||
o5 = i2;\
|
||||
o6 = i3;\
|
||||
o4 = _mm_unpacklo_epi64(o4, i6);\
|
||||
o5 = _mm_unpackhi_epi64(o5, i6);\
|
||||
o7 = i3;\
|
||||
o6 = _mm_unpacklo_epi64(o6, i7);\
|
||||
o7 = _mm_unpackhi_epi64(o7, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o0 = _mm_unpackhi_epi64(o0, i1);\
|
||||
o1 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i3);\
|
||||
o2 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o2 = _mm_unpackhi_epi64(o2, i5);\
|
||||
o3 = i6;\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
i7 = i6;\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i1, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i3, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i5, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
xmm13 = chaining[1];
|
||||
xmm14 = chaining[2];
|
||||
xmm15 = chaining[3];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm12;
|
||||
chaining[1] = xmm2;
|
||||
chaining[2] = xmm6;
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
__m128i* const message = (__m128i*) m;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
xmm13 = message[1];
|
||||
xmm14 = message[2];
|
||||
xmm15 = message[3];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
xmm8 = chaining[0];
|
||||
xmm0 = chaining[1];
|
||||
xmm4 = chaining[2];
|
||||
xmm5 = chaining[3];
|
||||
|
||||
/* xor message to CV to get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm12);
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm2);
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm6);
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm7);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
|
||||
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
|
||||
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
|
||||
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
|
||||
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
chaining[1] = xmm1;
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
xmm10 = chaining[1];
|
||||
xmm12 = chaining[2];
|
||||
xmm14 = chaining[3];
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[2] = xmm9;
|
||||
chaining[3] = xmm11;
|
||||
}
|
||||
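/* Minimal driver sketch (editorial addition, not part of the original file):
 * how INIT/TF512/OF512 are typically composed for the 512-bit variant.  The
 * caller is assumed to have loaded the Groestl-512 IV into 'state' and to
 * have padded the message, including the final length block; that logic
 * lives in the surrounding hash-groestl code. */
static void groestl512_compress_sketch(u64 *state, u64 *padded_msg,
                                       int num_blocks)
{
    int i;
    INIT(state);                          /* transpose the IV into row order  */
    for (i = 0; i < num_blocks; i++)      /* one TF512 call per 64-byte block */
        TF512(state, padded_msg + 8 * i);
    OF512(state);                         /* output transform and truncation  */
}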
|
||||
#endif
|
||||
|
||||
#if (LENGTH > 256)
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\
|
||||
for(i = 0; i < ROUNDS1024; i++)\
|
||||
{\
|
||||
ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
|
||||
}\
|
||||
}while(0);\
|
||||
|
||||
/* one round
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* SubBytes */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
#define ROUNDS_P(){\
|
||||
u8 round_counter = 0;\
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\
|
||||
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\
|
||||
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
|
||||
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\
|
||||
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\
|
||||
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\
|
||||
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\
|
||||
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
|
||||
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
|
||||
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
|
||||
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\
|
||||
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\
|
||||
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
|
||||
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
|
||||
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
||||
#define ROUNDS_Q(){\
|
||||
u8 round_counter = 0;\
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm1 = ALL_FF;\
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm1);\
|
||||
xmm9 = _mm_xor_si128(xmm9, xmm1);\
|
||||
xmm10 = _mm_xor_si128(xmm10, xmm1);\
|
||||
xmm11 = _mm_xor_si128(xmm11, xmm1);\
|
||||
xmm12 = _mm_xor_si128(xmm12, xmm1);\
|
||||
xmm13 = _mm_xor_si128(xmm13, xmm1);\
|
||||
xmm14 = _mm_xor_si128(xmm14, xmm1);\
|
||||
xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\
|
||||
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\
|
||||
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\
|
||||
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\
|
||||
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\
|
||||
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\
|
||||
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\
|
||||
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm9 = ALL_FF;\
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm9);\
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm9);\
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm9);\
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm9);\
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm9);\
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm9);\
|
||||
xmm6 = _mm_xor_si128(xmm6, xmm9);\
|
||||
xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\
|
||||
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\
|
||||
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\
|
||||
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\
|
||||
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\
|
||||
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\
|
||||
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\
|
||||
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
||||
/* Matrix Transpose
|
||||
* input is a 1024-bit state with two columns in one xmm
|
||||
* output is a 1024-bit state with two rows in one xmm
|
||||
* inputs: i0-i7
|
||||
* outputs: i0-i7
|
||||
* clobbers: t0-t7
|
||||
*/
|
||||
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i6 = _mm_shuffle_epi8(i6, t0);\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
t1 = i2;\
|
||||
i4 = _mm_shuffle_epi8(i4, t0);\
|
||||
i5 = _mm_shuffle_epi8(i5, t0);\
|
||||
t2 = i4;\
|
||||
t3 = i6;\
|
||||
i7 = _mm_shuffle_epi8(i7, t0);\
|
||||
\
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t0 = i0;\
|
||||
t2 = _mm_unpackhi_epi16(t2, i5);\
|
||||
i4 = _mm_unpacklo_epi16(i4, i5);\
|
||||
t3 = _mm_unpackhi_epi16(t3, i7);\
|
||||
i6 = _mm_unpacklo_epi16(i6, i7);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i1);\
|
||||
t1 = _mm_unpackhi_epi16(t1, i3);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
\
|
||||
/* shuffle with immediate */\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
t1 = _mm_shuffle_epi32(t1, 216);\
|
||||
t2 = _mm_shuffle_epi32(t2, 216);\
|
||||
t3 = _mm_shuffle_epi32(t3, 216);\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
i4 = _mm_shuffle_epi32(i4, 216);\
|
||||
i6 = _mm_shuffle_epi32(i6, 216);\
|
||||
\
|
||||
/* continue with unpack */\
|
||||
t4 = i0;\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
t4 = _mm_unpackhi_epi32(t4, i2);\
|
||||
t5 = t0;\
|
||||
t0 = _mm_unpacklo_epi32(t0, t1);\
|
||||
t5 = _mm_unpackhi_epi32(t5, t1);\
|
||||
t6 = i4;\
|
||||
i4 = _mm_unpacklo_epi32(i4, i6);\
|
||||
t7 = t2;\
|
||||
t6 = _mm_unpackhi_epi32(t6, i6);\
|
||||
i2 = t0;\
|
||||
t2 = _mm_unpacklo_epi32(t2, t3);\
|
||||
i3 = t0;\
|
||||
t7 = _mm_unpackhi_epi32(t7, t3);\
|
||||
\
|
||||
/* there are now 2 rows in each xmm */\
|
||||
/* unpack to get 1 row of CV in each xmm */\
|
||||
i1 = i0;\
|
||||
i1 = _mm_unpackhi_epi64(i1, i4);\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
i4 = t4;\
|
||||
i3 = _mm_unpackhi_epi64(i3, t2);\
|
||||
i5 = t4;\
|
||||
i2 = _mm_unpacklo_epi64(i2, t2);\
|
||||
i6 = t5;\
|
||||
i5 = _mm_unpackhi_epi64(i5, t6);\
|
||||
i7 = t5;\
|
||||
i4 = _mm_unpacklo_epi64(i4, t6);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t7);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t7);\
|
||||
/* transpose done */\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse
|
||||
* input is a 1024-bit state with two rows in one xmm
|
||||
* output is a 1024-bit state with two columns in one xmm
|
||||
* inputs: i0-i7
|
||||
* outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
|
||||
* clobbers: t0-t4
|
||||
*/
|
||||
#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
|
||||
/* transpose matrix to get output format */\
|
||||
o1 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i1);\
|
||||
t0 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi64(t0, i3);\
|
||||
t1 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
t1 = _mm_unpackhi_epi64(t1, i5);\
|
||||
t2 = i6;\
|
||||
o0 = TRANSP_MASK;\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
t2 = _mm_unpackhi_epi64(t2, i7);\
|
||||
/* load transpose mask into a register, because it will be used 8 times */\
|
||||
i0 = _mm_shuffle_epi8(i0, o0);\
|
||||
i2 = _mm_shuffle_epi8(i2, o0);\
|
||||
i4 = _mm_shuffle_epi8(i4, o0);\
|
||||
i6 = _mm_shuffle_epi8(i6, o0);\
|
||||
o1 = _mm_shuffle_epi8(o1, o0);\
|
||||
t0 = _mm_shuffle_epi8(t0, o0);\
|
||||
t1 = _mm_shuffle_epi8(t1, o0);\
|
||||
t2 = _mm_shuffle_epi8(t2, o0);\
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t3 = i4;\
|
||||
o2 = o1;\
|
||||
o0 = i0;\
|
||||
t4 = t1;\
|
||||
\
|
||||
t3 = _mm_unpackhi_epi16(t3, i6);\
|
||||
i4 = _mm_unpacklo_epi16(i4, i6);\
|
||||
o0 = _mm_unpackhi_epi16(o0, i2);\
|
||||
i0 = _mm_unpacklo_epi16(i0, i2);\
|
||||
o2 = _mm_unpackhi_epi16(o2, t0);\
|
||||
o1 = _mm_unpacklo_epi16(o1, t0);\
|
||||
t4 = _mm_unpackhi_epi16(t4, t2);\
|
||||
t1 = _mm_unpacklo_epi16(t1, t2);\
|
||||
/* shuffle with immediate */\
|
||||
i4 = _mm_shuffle_epi32(i4, 216);\
|
||||
t3 = _mm_shuffle_epi32(t3, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
o2 = _mm_shuffle_epi32(o2, 216);\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o0 = _mm_shuffle_epi32(o0, 216);\
|
||||
t1 = _mm_shuffle_epi32(t1, 216);\
|
||||
t4 = _mm_shuffle_epi32(t4, 216);\
|
||||
/* continue with unpack */\
|
||||
i1 = i0;\
|
||||
i3 = o0;\
|
||||
i5 = o1;\
|
||||
i7 = o2;\
|
||||
i0 = _mm_unpacklo_epi32(i0, i4);\
|
||||
i1 = _mm_unpackhi_epi32(i1, i4);\
|
||||
o0 = _mm_unpacklo_epi32(o0, t3);\
|
||||
i3 = _mm_unpackhi_epi32(i3, t3);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t1);\
|
||||
i5 = _mm_unpackhi_epi32(i5, t1);\
|
||||
o2 = _mm_unpacklo_epi32(o2, t4);\
|
||||
i7 = _mm_unpackhi_epi32(i7, t4);\
|
||||
/* transpose done */\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
xmm9 = chaining[1];
|
||||
xmm10 = chaining[2];
|
||||
xmm11 = chaining[3];
|
||||
xmm12 = chaining[4];
|
||||
xmm13 = chaining[5];
|
||||
xmm14 = chaining[6];
|
||||
xmm15 = chaining[7];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm8;
|
||||
chaining[1] = xmm9;
|
||||
chaining[2] = xmm10;
|
||||
chaining[3] = xmm11;
|
||||
chaining[4] = xmm12;
|
||||
chaining[5] = xmm13;
|
||||
chaining[6] = xmm14;
|
||||
chaining[7] = xmm15;
|
||||
}
|
||||
|
||||
void TF1024(u64* h, u64* m)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
__m128i* const message = (__m128i*) m;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i QTEMP[8];
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm8 - xmm15 (Q = message) */
|
||||
xmm8 = message[0];
|
||||
xmm9 = message[1];
|
||||
xmm10 = message[2];
|
||||
xmm11 = message[3];
|
||||
xmm12 = message[4];
|
||||
xmm13 = message[5];
|
||||
xmm14 = message[6];
|
||||
xmm15 = message[7];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
|
||||
/* store message M (Q input) for later */
|
||||
QTEMP[0] = xmm8;
|
||||
QTEMP[1] = xmm9;
|
||||
QTEMP[2] = xmm10;
|
||||
QTEMP[3] = xmm11;
|
||||
QTEMP[4] = xmm12;
|
||||
QTEMP[5] = xmm13;
|
||||
QTEMP[6] = xmm14;
|
||||
QTEMP[7] = xmm15;
|
||||
|
||||
/* xor CV to message to get P input */
|
||||
/* result: CV+M in xmm8...xmm15 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
|
||||
/* compute permutation P */
|
||||
/* result: P(CV+M) in xmm8...xmm15 */
|
||||
ROUNDS_P();
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV+M)+CV in xmm8...xmm15 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
|
||||
/* store P(CV+M)+CV */
|
||||
chaining[0] = xmm8;
|
||||
chaining[1] = xmm9;
|
||||
chaining[2] = xmm10;
|
||||
chaining[3] = xmm11;
|
||||
chaining[4] = xmm12;
|
||||
chaining[5] = xmm13;
|
||||
chaining[6] = xmm14;
|
||||
chaining[7] = xmm15;
|
||||
|
||||
/* load message M (Q input) into xmm8-15 */
|
||||
xmm8 = QTEMP[0];
|
||||
xmm9 = QTEMP[1];
|
||||
xmm10 = QTEMP[2];
|
||||
xmm11 = QTEMP[3];
|
||||
xmm12 = QTEMP[4];
|
||||
xmm13 = QTEMP[5];
|
||||
xmm14 = QTEMP[6];
|
||||
xmm15 = QTEMP[7];
|
||||
|
||||
/* compute permutation Q */
|
||||
/* result: Q(M) in xmm8...xmm15 */
|
||||
ROUNDS_Q();
|
||||
|
||||
/* xor Q output */
|
||||
/* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm8;
|
||||
chaining[1] = xmm9;
|
||||
chaining[2] = xmm10;
|
||||
chaining[3] = xmm11;
|
||||
chaining[4] = xmm12;
|
||||
chaining[5] = xmm13;
|
||||
chaining[6] = xmm14;
|
||||
chaining[7] = xmm15;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void OF1024(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
xmm9 = chaining[1];
|
||||
xmm10 = chaining[2];
|
||||
xmm11 = chaining[3];
|
||||
xmm12 = chaining[4];
|
||||
xmm13 = chaining[5];
|
||||
xmm14 = chaining[6];
|
||||
xmm15 = chaining[7];
|
||||
|
||||
/* compute permutation P */
|
||||
/* result: P(CV) in xmm8...xmm15 */
|
||||
ROUNDS_P();
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8...xmm15 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
|
||||
/* transpose CV back from row ordering to column ordering */
|
||||
/* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
|
||||
Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[4] = xmm0;
|
||||
chaining[5] = xmm6;
|
||||
chaining[6] = xmm13;
|
||||
chaining[7] = xmm15;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
1072  algo/groestl/aes_ni/groestl-intr-avx.h    (new file; diff suppressed because it is too large)
1294  algo/groestl/aes_ni/groestl-intr-vperm.h  (new file; diff suppressed because it is too large)
16    algo/groestl/aes_ni/groestl-version.h   (new file)
@@ -0,0 +1,16 @@
// specify assembly or intrinsics implementation
//#define TASM
#define TINTR

//#define AES_NI

//#ifdef AES_NI
// specify AES-NI, AVX (with AES-NI) or vector-permute implementation

//#ifndef NO_AES_NI

#define VAES
// #define VAVX
// #define VVPERM

//#endif
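A note on groestl-version.h: the TASM/TINTR and VAES/VAVX/VVPERM switches select which of the groestl-asm-*.h or groestl-intr-*.h headers gets compiled. A hedged sketch of the usual selection pattern; the real include logic lives in the package's hash-groestl sources and its exact #if structure may differ.

#if defined(TASM)
  #if defined(VAES)
    #include "groestl-asm-aes.h"      /* inline assembly, AES-NI         */
  #elif defined(VAVX)
    #include "groestl-asm-avx.h"      /* inline assembly, AVX + AES-NI   */
  #elif defined(VVPERM)
    #include "groestl-asm-vperm.h"    /* inline assembly, vector permute */
  #endif
#elif defined(TINTR)
  #if defined(VAES)
    #include "groestl-intr-aes.h"     /* intrinsics, AES-NI              */
  #elif defined(VAVX)
    #include "groestl-intr-avx.h"     /* intrinsics, AVX + AES-NI        */
  #elif defined(VVPERM)
    #include "groestl-intr-vperm.h"   /* intrinsics, vector permute      */
  #endif
#endif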
529   algo/groestl/aes_ni/groestl256-asm-aes.h   (new file)
@@ -0,0 +1,529 @@
|
||||
/* groestl-asm-aes.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with inline assembly using ssse3, sse4.1, and aes
|
||||
* instructions.
|
||||
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl256.h"
|
||||
/* global constants */
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
|
||||
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
|
||||
|
||||
/* temporary variables */
|
||||
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP[3*16];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\
|
||||
asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\
|
||||
asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\
|
||||
asm("pand xmm"tostr(j)", xmm"tostr(k)"");\
|
||||
asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\
|
||||
}/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+1}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t_{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
|
||||
asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
|
||||
asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\
|
||||
asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
|
||||
asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\
|
||||
asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\
|
||||
asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\
|
||||
asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\
|
||||
asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
|
||||
asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\
|
||||
asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\
|
||||
asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\
|
||||
asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\
|
||||
asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\
|
||||
asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
|
||||
asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
|
||||
asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\
|
||||
asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
|
||||
asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
|
||||
asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\
|
||||
asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
|
||||
asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\
|
||||
asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\
|
||||
asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
asm("movaps xmm"tostr(b1)", [ALL_1B]");\
|
||||
MUL2(a0, b0, b1);\
|
||||
asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\
|
||||
MUL2(a1, b0, b1);\
|
||||
asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\
|
||||
MUL2(a2, b0, b1);\
|
||||
asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\
|
||||
MUL2(a3, b0, b1);\
|
||||
asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\
|
||||
MUL2(a4, b0, b1);\
|
||||
asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\
|
||||
MUL2(a5, b0, b1);\
|
||||
asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\
|
||||
MUL2(a6, b0, b1);\
|
||||
asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\
|
||||
MUL2(a7, b0, b1);\
|
||||
asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
|
||||
MUL2(a1, b0, b1);\
|
||||
asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\
|
||||
MUL2(a2, b0, b1);\
|
||||
asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
|
||||
MUL2(a5, b0, b1);\
|
||||
asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\
|
||||
MUL2(a6, b0, b1);\
|
||||
asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
|
||||
MUL2(a7, b0, b1);\
|
||||
asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\
|
||||
asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\
|
||||
asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
|
||||
}/*MixBytes*/
|
||||
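/* Scalar reference for MixBytes (sketch only, independent of the register
 * scheduling above): every column of the Groestl state is multiplied over
 * GF(2^8) by the circulant matrix B = circ(02,02,03,04,05,03,05,07). */
static unsigned char gf256_mul_small(unsigned char a, unsigned char c)
{
  unsigned char r = 0;
  while (c) {                                 /* schoolbook multiply by a small constant */
    if (c & 1) r ^= a;
    a = (unsigned char)((a << 1) ^ ((a & 0x80) ? 0x1b : 0x00));
    c >>= 1;
  }
  return r;
}

static void mixbytes_column_ref(unsigned char col[8])
{
  static const unsigned char B[8] = { 2, 2, 3, 4, 5, 3, 5, 7 };
  unsigned char out[8];
  int i, j;
  for (i = 0; i < 8; i++) {
    unsigned char acc = 0;
    for (j = 0; j < 8; j++)
      acc ^= gf256_mul_small(col[j], B[(j - i + 8) % 8]);   /* row i of the circulant */
    out[i] = acc;
  }
  for (i = 0; i < 8; i++) col[i] = out[i];
}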
|
||||
#define SET_CONSTANTS(){\
|
||||
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
|
||||
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
|
||||
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
|
||||
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
|
||||
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
|
||||
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
|
||||
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
|
||||
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
|
||||
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
|
||||
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
|
||||
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
|
||||
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
|
||||
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
|
||||
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
|
||||
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
|
||||
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
|
||||
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
|
||||
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
|
||||
}\
|
||||
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
|
||||
}while(0);
|
||||
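/* Usage sketch (an assumption, not code from this file): SET_CONSTANTS()
 * reads a loop counter named i from the enclosing scope, so a caller has to
 * declare it before invoking the macro. The real call site in cpuminer-opt
 * may look different. */
static void init_groestl256_constants(void)
{
  int i;               /* consumed by the for-loop inside SET_CONSTANTS */
  SET_CONSTANTS();
}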
|
||||
#define Push_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("push rax");\
|
||||
asm("push rbx");\
|
||||
asm("push rcx");*/\
|
||||
}while(0);
|
||||
|
||||
#define Pop_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("pop rcx");\
|
||||
asm("pop rbx");\
|
||||
asm("pop rax");*/\
|
||||
}while(0);
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
|
||||
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
|
||||
asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
|
||||
asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
|
||||
asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
|
||||
asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
|
||||
asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
|
||||
asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
|
||||
asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
|
||||
asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
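/* Intrinsics sketch (illustrative only; this file itself sticks to inline
 * asm, and <immintrin.h> is assumed) of the per-row trick used in ROUND():
 * SUBSH_MASK is chosen so the pshufb already compensates for the ShiftRows
 * that aesenclast performs internally, and an all-zero round key turns
 * aesenclast into a pure AES S-box layer, which is exactly Groestl's
 * SubBytes. */
#include <immintrin.h>

static inline __m128i groestl_subshift_row(__m128i row, __m128i subsh_mask)
{
  row = _mm_shuffle_epi8(row, subsh_mask);                /* ShiftBytes (pre-compensated) */
  return _mm_aesenclast_si128(row, _mm_setzero_si128());  /* SubBytes with a zero key */
}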
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
|
||||
\
|
||||
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
|
||||
\
|
||||
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
|
||||
\
|
||||
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
|
||||
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
|
||||
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
|
||||
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
|
||||
\
|
||||
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
|
||||
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
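/* Scalar model of what the transpose macros accomplish (a sketch, assuming
 * the usual Groestl-512 view of the state as an 8x8 byte matrix stored
 * column by column): convert to row-major order so that each 64-bit row can
 * be processed byte-sliced inside the xmm registers. */
static void transpose_8x8_ref(const unsigned char in[64], unsigned char out[64])
{
  int r, c;
  for (r = 0; r < 8; r++)
    for (c = 0; c < 8; c++)
      out[r * 8 + c] = in[c * 8 + r];   /* row-major <- column-major */
}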
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
|
||||
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
|
||||
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
asm volatile ("emms");
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
asm ("movaps xmm12, [rdi+0*16]");
|
||||
asm ("movaps xmm13, [rdi+1*16]");
|
||||
asm ("movaps xmm14, [rdi+2*16]");
|
||||
asm ("movaps xmm15, [rdi+3*16]");
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* store transposed IV */
|
||||
asm ("movaps [rdi+0*16], xmm12");
|
||||
asm ("movaps [rdi+1*16], xmm2");
|
||||
asm ("movaps [rdi+2*16], xmm6");
|
||||
asm ("movaps [rdi+3*16], xmm7");
|
||||
|
||||
asm volatile ("emms");
|
||||
asm (".att_syntax noprefix");
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
/* message M in rsi */
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load message into registers xmm12 - xmm15 (Q = message) */
|
||||
asm ("movaps xmm12, [rsi+0*16]");
|
||||
asm ("movaps xmm13, [rsi+1*16]");
|
||||
asm ("movaps xmm14, [rsi+2*16]");
|
||||
asm ("movaps xmm15, [rsi+3*16]");
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm0, [rdi+1*16]");
|
||||
asm ("movaps xmm4, [rdi+2*16]");
|
||||
asm ("movaps xmm5, [rdi+3*16]");
|
||||
|
||||
  /* xor message to CV to get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
asm ("pxor xmm8, xmm12");
|
||||
asm ("pxor xmm0, xmm2");
|
||||
asm ("pxor xmm4, xmm6");
|
||||
asm ("pxor xmm5, xmm7");
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
  /* result: the 8 rows of P and Q in xmm8 - xmm15 */
|
||||
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, xmm8");
|
||||
asm ("pxor xmm1, xmm10");
|
||||
asm ("pxor xmm2, xmm12");
|
||||
asm ("pxor xmm3, xmm14");
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, [rdi+0*16]");
|
||||
asm ("pxor xmm1, [rdi+1*16]");
|
||||
asm ("pxor xmm2, [rdi+2*16]");
|
||||
asm ("pxor xmm3, [rdi+3*16]");
|
||||
|
||||
/* store CV */
|
||||
asm ("movaps [rdi+0*16], xmm0");
|
||||
asm ("movaps [rdi+1*16], xmm1");
|
||||
asm ("movaps [rdi+2*16], xmm2");
|
||||
asm ("movaps [rdi+3*16], xmm3");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
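/* Reference formula for what TF512 computes, the Groestl compression
 *   CV' = P(CV ^ M) ^ Q(M) ^ CV
 * sketched byte-wise below. P512()/Q512() are assumed reference permutations
 * supplied by the caller; they are not defined in this file. */
static void compress512_ref(unsigned char cv[64], const unsigned char m[64],
                            void (*P512)(unsigned char s[64]),
                            void (*Q512)(unsigned char s[64]))
{
  unsigned char p[64], q[64];
  int i;
  for (i = 0; i < 64; i++) { p[i] = (unsigned char)(cv[i] ^ m[i]); q[i] = m[i]; }
  P512(p);
  Q512(q);
  for (i = 0; i < 64; i++) cv[i] ^= (unsigned char)(p[i] ^ q[i]);
}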
|
||||
void OF512(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm10, [rdi+1*16]");
|
||||
asm ("movaps xmm12, [rdi+2*16]");
|
||||
asm ("movaps xmm14, [rdi+3*16]");
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("pxor xmm8, [rdi+0*16]");
|
||||
asm ("pxor xmm10, [rdi+1*16]");
|
||||
asm ("pxor xmm12, [rdi+2*16]");
|
||||
asm ("pxor xmm14, [rdi+3*16]");
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
asm ("movaps [rdi+2*16], xmm9");
|
||||
asm ("movaps [rdi+3*16], xmm11");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
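/* Sketch of how a caller can read the Groestl-256 digest once OF512() has
 * run (an assumption about the surrounding driver code, not something
 * defined in this file): the output transform writes the truncated half of
 * the 64-byte chaining value back at offsets 2*16 and 3*16, so the 256-bit
 * digest is simply the last 32 bytes. */
#include <string.h>

static void extract_digest256(const unsigned char cv[64], unsigned char digest[32])
{
  memcpy(digest, cv + 32, 32);   /* truncation: keep the upper half */
}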
519
algo/groestl/aes_ni/groestl256-asm-avx.h
Normal file
@@ -0,0 +1,519 @@
|
||||
/* groestl-asm-avx.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx
|
||||
* instructions.
|
||||
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global variables */
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16];
|
||||
__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ALL_1B[32];
|
||||
__attribute__ ((aligned (32))) unsigned char ALL_FF[32];
|
||||
|
||||
/* temporary variables */
|
||||
__attribute__ ((aligned (32))) unsigned char TEMP[6*32];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
|
||||
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
|
||||
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
|
||||
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
|
||||
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
|
||||
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
|
||||
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
|
||||
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
|
||||
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
|
||||
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
|
||||
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
|
||||
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
|
||||
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
|
||||
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
|
||||
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
|
||||
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
|
||||
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
|
||||
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
|
||||
}\
|
||||
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
|
||||
}while(0);
|
||||
|
||||
#define Push_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("push rax");\
|
||||
asm("push rbx");\
|
||||
asm("push rcx");*/\
|
||||
}while(0);
|
||||
|
||||
#define Pop_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("pop rcx");\
|
||||
asm("pop rbx");\
|
||||
asm("pop rax");*/\
|
||||
}while(0);
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b
|
||||
* xmm[z] has to be zero */
|
||||
#define VMUL2(i, j, k, z){\
|
||||
asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\
|
||||
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
|
||||
asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\
|
||||
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
|
||||
}/**/
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b
|
||||
* xmm[z] has to be zero */
|
||||
#define VMUL2v2(i, j, k, z){\
|
||||
asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\
|
||||
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
|
||||
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
|
||||
}/**/
|
||||
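/* Intrinsics sketch of the VMUL2v2 variant above (illustrative, assuming
 * <immintrin.h> and SSE4.1): vpblendvb selects the 0x1b byte wherever the
 * top bit of the source byte is set, which is exactly the conditional
 * reduction needed for doubling in GF(2^8). */
#include <immintrin.h>

static inline __m128i vmul2_ref(__m128i a, __m128i all_1b)
{
  __m128i red = _mm_blendv_epi8(_mm_setzero_si128(), all_1b, a);  /* 0x1b where MSB(a) = 1 */
  return _mm_xor_si128(_mm_add_epi8(a, a), red);                  /* (a << 1) ^ reduction  */
}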
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+3}
|
||||
x_i = t_i + t_{i+3}
|
||||
  y_i = t_i + t_{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
|
||||
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
|
||||
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
|
||||
asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
|
||||
asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
|
||||
asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
|
||||
asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
|
||||
asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
|
||||
asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
|
||||
\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\
|
||||
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\
|
||||
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\
|
||||
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\
|
||||
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\
|
||||
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\
|
||||
\
|
||||
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\
|
||||
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\
|
||||
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\
|
||||
\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\
|
||||
asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\
|
||||
asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\
|
||||
\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
|
||||
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
|
||||
asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\
|
||||
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\
|
||||
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\
|
||||
\
|
||||
/*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\
|
||||
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\
|
||||
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\
|
||||
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\
|
||||
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\
|
||||
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\
|
||||
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\
|
||||
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\
|
||||
\
|
||||
/*compute v_i: double w_i */\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\
|
||||
asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\
|
||||
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\
|
||||
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\
|
||||
}/*MixBytes*/
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
|
||||
asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\
|
||||
\
|
||||
asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
\
|
||||
asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
\
|
||||
asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
|
||||
asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
|
||||
asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
|
||||
asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
|
||||
\
|
||||
asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
asm volatile ("emms");
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
asm ("vmovaps xmm12, [rdi+0*16]");
|
||||
asm ("vmovaps xmm13, [rdi+1*16]");
|
||||
asm ("vmovaps xmm14, [rdi+2*16]");
|
||||
asm ("vmovaps xmm15, [rdi+3*16]");
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* store transposed IV */
|
||||
asm ("vmovaps [rdi+0*16], xmm12");
|
||||
asm ("vmovaps [rdi+1*16], xmm2");
|
||||
asm ("vmovaps [rdi+2*16], xmm6");
|
||||
asm ("vmovaps [rdi+3*16], xmm7");
|
||||
|
||||
asm volatile ("emms");
|
||||
asm (".att_syntax noprefix");
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
/* message M in rsi */
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load message into registers xmm12 - xmm15 (Q = message) */
|
||||
asm ("vmovaps xmm12, [rsi+0*16]");
|
||||
asm ("vmovaps xmm13, [rsi+1*16]");
|
||||
asm ("vmovaps xmm14, [rsi+2*16]");
|
||||
asm ("vmovaps xmm15, [rsi+3*16]");
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* load previous chaining value and xor message to CV to get input of P */
|
||||
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
asm ("vpxor xmm8, xmm12, [rdi+0*16]");
|
||||
asm ("vpxor xmm0, xmm2, [rdi+1*16]");
|
||||
asm ("vpxor xmm4, xmm6, [rdi+2*16]");
|
||||
asm ("vpxor xmm5, xmm7, [rdi+3*16]");
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
  /* result: the 8 rows of P and Q in xmm8 - xmm15 */
|
||||
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
asm ("vpxor xmm0, xmm0, xmm8");
|
||||
asm ("vpxor xmm1, xmm1, xmm10");
|
||||
asm ("vpxor xmm2, xmm2, xmm12");
|
||||
asm ("vpxor xmm3, xmm3, xmm14");
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
asm ("vpxor xmm0, xmm0, [rdi+0*16]");
|
||||
asm ("vpxor xmm1, xmm1, [rdi+1*16]");
|
||||
asm ("vpxor xmm2, xmm2, [rdi+2*16]");
|
||||
asm ("vpxor xmm3, xmm3, [rdi+3*16]");
|
||||
|
||||
/* store CV */
|
||||
asm ("vmovaps [rdi+0*16], xmm0");
|
||||
asm ("vmovaps [rdi+1*16], xmm1");
|
||||
asm ("vmovaps [rdi+2*16], xmm2");
|
||||
asm ("vmovaps [rdi+3*16], xmm3");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("vmovaps xmm8, [rdi+0*16]");
|
||||
asm ("vmovaps xmm10, [rdi+1*16]");
|
||||
asm ("vmovaps xmm12, [rdi+2*16]");
|
||||
asm ("vmovaps xmm14, [rdi+3*16]");
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("vpxor xmm8, xmm8, [rdi+0*16]");
|
||||
asm ("vpxor xmm10, xmm10, [rdi+1*16]");
|
||||
asm ("vpxor xmm12, xmm12, [rdi+2*16]");
|
||||
asm ("vpxor xmm14, xmm14, [rdi+3*16]");
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
asm ("vmovaps [rdi+2*16], xmm9");
|
||||
asm ("vmovaps [rdi+3*16], xmm11");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
856
algo/groestl/aes_ni/groestl256-asm-vperm.h
Normal file
@@ -0,0 +1,856 @@
|
||||
/* groestl-asm-vperm.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with inline assembly using ssse3 instructions.
|
||||
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* Based on the vperm and aes_ni implementations of the hash function Groestl
|
||||
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
|
||||
* Institute of Applied Mathematics, Middle East Technical University, Turkey
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
|
||||
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_0F[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_15[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_63[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16];
|
||||
|
||||
/* temporary variables */
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16];
|
||||
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP[8*16];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define SET_SHARED_CONSTANTS(){\
|
||||
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
|
||||
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
|
||||
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\
|
||||
((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\
|
||||
((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\
|
||||
((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\
|
||||
((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\
|
||||
((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\
|
||||
((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\
|
||||
((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\
|
||||
((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\
|
||||
((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\
|
||||
((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\
|
||||
((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\
|
||||
((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\
|
||||
((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\
|
||||
((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\
|
||||
((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\
|
||||
((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\
|
||||
((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\
|
||||
((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\
|
||||
((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\
|
||||
((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\
|
||||
((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\
|
||||
((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\
|
||||
((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\
|
||||
((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\
|
||||
((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\
|
||||
((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\
|
||||
((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\
|
||||
/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\
|
||||
((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\
|
||||
((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\
|
||||
((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\
|
||||
((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\
|
||||
((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform w/o setting the constants c*
|
||||
* transforms 2 rows to/from "vperm mode"
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0, a1 = 2 rows
|
||||
* table = transformation table to use
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0, a1 = 2 rows transformed with table
|
||||
* */
|
||||
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
|
||||
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\
|
||||
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
|
||||
asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\
|
||||
asm ("psrld xmm"tostr(t0)", 4");\
|
||||
asm ("psrld xmm"tostr(t1)", 4");\
|
||||
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
|
||||
asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\
|
||||
asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\
|
||||
asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\
|
||||
asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\
|
||||
asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\
|
||||
asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\
|
||||
asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\
|
||||
asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\
|
||||
}/**/
|
||||
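/* Scalar model of the vperm transform above (sketch only): each byte is
 * split into its low and high nibble, each nibble indexes a 16-entry table
 * (done with pshufb in the real code), and the two lookups are xored
 * together. Which half of the 32-byte table serves which nibble follows the
 * constants loaded by VPERM_Transform_Set_Const. */
static unsigned char vperm_transform_ref(unsigned char x,
                                         const unsigned char lo_tab[16],
                                         const unsigned char hi_tab[16])
{
  return (unsigned char)(lo_tab[x & 0x0f] ^ hi_tab[x >> 4]);
}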
|
||||
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
|
||||
asm ("movaps xmm"tostr(c0)", [ALL_0F]");\
|
||||
asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\
|
||||
asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform
|
||||
* transforms 2 rows to/from "vperm mode"
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0, a1 = 2 rows
|
||||
* table = transformation table to use
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0, a1 = 2 rows transformed with table
|
||||
* */
|
||||
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform State
|
||||
* inputs:
|
||||
* a0-a3 = state
|
||||
* table = transformation table to use
|
||||
* t* = clobbers
|
||||
* outputs:
|
||||
* a0-a3 = transformed state
|
||||
* */
|
||||
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Add Constant to State
|
||||
* inputs:
|
||||
* a0-a7 = state
|
||||
* constant = constant to add
|
||||
* t0 = clobber
|
||||
* outputs:
|
||||
* a0-a7 = state + constant
|
||||
* */
|
||||
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
|
||||
asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\
|
||||
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Set Substitute Core Constants
|
||||
* */
|
||||
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Substitute Core
|
||||
* first part of sbox inverse computation
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0 = 1 row
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* b0a, b0b = inputs for lookup step
|
||||
* */
|
||||
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
|
||||
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
|
||||
asm ("psrld xmm"tostr(t0)", 4");\
|
||||
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
|
||||
asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\
|
||||
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
|
||||
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\
|
||||
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\
|
||||
asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\
|
||||
asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\
|
||||
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\
|
||||
asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Lookup
|
||||
* second part of sbox inverse computation
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0a, a0b = output of Substitution Core
|
||||
* table = lookup table to use (*1 / *2 / *4)
|
||||
* t0 = clobber
|
||||
* outputs:
|
||||
* b0 = output of sbox + multiplication
|
||||
* */
|
||||
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
|
||||
asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\
|
||||
asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\
|
||||
asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\
|
||||
asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* SubBytes and *2 / *4
|
||||
* this function is derived from:
|
||||
* Constant-time SSSE3 AES core implementation
|
||||
* by Mike Hamburg
|
||||
* and
|
||||
* vperm and aes_ni implementations of hash function Grostl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0-a7 = state
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0-a7 = state * 4
|
||||
* c2 = row0 * 2 -> b0
|
||||
* c1 = row7 * 2 -> b3
|
||||
* c0 = row7 * 1 -> b4
|
||||
* t2 = row4 * 1 -> b7
|
||||
* TEMP_MUL1 = row(i) * 1
|
||||
* TEMP_MUL2 = row(i) * 2
|
||||
*
|
||||
* call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
|
||||
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
|
||||
/* set Constants */\
|
||||
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
|
||||
/* row 1 */\
|
||||
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
|
||||
/* --- */\
|
||||
/* row 2 */\
|
||||
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
|
||||
/* --- */\
|
||||
/* row 3 */\
|
||||
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
|
||||
/* --- */\
|
||||
/* row 5 */\
|
||||
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
|
||||
/* --- */\
|
||||
/* row 6 */\
|
||||
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
|
||||
/* --- */\
|
||||
/* row 7 */\
|
||||
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
|
||||
/* --- */\
|
||||
/* row 4 */\
|
||||
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
|
||||
/* --- */\
|
||||
/* row 0 */\
|
||||
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
|
||||
asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
|
||||
/* --- */\
|
||||
}/**/
|
||||
|
||||
|
||||
/* Optimized MixBytes
|
||||
* inputs:
|
||||
* a0-a7 = (row0-row7) * 4
|
||||
* b0 = row0 * 2
|
||||
* b3 = row7 * 2
|
||||
* b4 = row7 * 1
|
||||
* b7 = row4 * 1
|
||||
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
|
||||
* output: b0-b7
|
||||
* */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* save one value */\
|
||||
asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\
|
||||
/* 1 */\
|
||||
asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\
|
||||
asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\
|
||||
asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\
|
||||
\
|
||||
/* 2 */\
|
||||
asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\
|
||||
asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\
|
||||
\
|
||||
/* 4 */\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\
|
||||
/*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\
|
||||
asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\
|
||||
asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\
|
||||
asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\
|
||||
\
|
||||
/* 3 */\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\
|
||||
/*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\
|
||||
asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\
|
||||
\
|
||||
/* 5 */\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
|
||||
/*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\
|
||||
asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\
|
||||
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\
|
||||
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\
|
||||
asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\
|
||||
asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\
|
||||
\
|
||||
/* 6 */\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* 7 */\
|
||||
asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\
|
||||
asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\
|
||||
asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
|
||||
\
|
||||
/* 8 */\
|
||||
asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\
|
||||
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\
|
||||
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
|
||||
\
|
||||
/* 9 */\
|
||||
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* 10 */\
|
||||
asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\
|
||||
\
|
||||
/* 11 */\
|
||||
asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\
|
||||
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
|
||||
\
|
||||
/* 12 */\
|
||||
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\
|
||||
asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* 13 */\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
|
||||
asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
|
||||
}/**/
|
||||
|
||||
//#if (LENGTH <= 256)
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
SET_SHARED_CONSTANTS();\
|
||||
((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\
|
||||
((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\
|
||||
((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\
|
||||
((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\
|
||||
((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\
|
||||
((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\
|
||||
((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\
|
||||
((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\
|
||||
((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\
|
||||
((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\
|
||||
((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\
|
||||
((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\
|
||||
((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\
|
||||
((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\
|
||||
((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\
|
||||
((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
|
||||
}\
|
||||
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
|
||||
}/**/
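/* Worked example of the constants built above (following the expressions in
 * SET_CONSTANTS): ROUND_CONST_L0[i] holds the bytes 0x00^i, 0x10^i, ..., 0x70^i
 * in its low quadword and all-ones in its high quadword, e.g. for round i=1
 * the low quadword is 0x7161514131211101; ROUND_CONST_L7[i] holds
 * 0xff^i, 0xef^i, ..., 0x8f^i in its high quadword and zero in its low
 * quadword.  The SUBSH_MASK entries are the per-row pshufb patterns used for
 * ShiftBytes in ROUND below. */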
|
||||
|
||||
#define Push_All_Regs(){\
|
||||
/* not using any...
|
||||
asm("push rax");\
|
||||
asm("push rbx");\
|
||||
asm("push rcx");*/\
|
||||
}/**/
|
||||
|
||||
#define Pop_All_Regs(){\
|
||||
/* not using any...
|
||||
asm("pop rcx");\
|
||||
asm("pop rbx");\
|
||||
asm("pop rax");*/\
|
||||
}/**/
|
||||
|
||||
|
||||
/* vperm:
|
||||
* transformation before rounds with ipt
|
||||
* first round add transformed constant
|
||||
* middle rounds: add constant XOR 0x15...15
|
||||
* last round: additionally add 0x15...15 after MB
|
||||
* transformation after rounds with opt
|
||||
*/
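/* Sketch of the scheme, as far as it can be read from the code below: the
 * state is kept in the transformed basis of the constant-time SSSE3 AES
 * technique.  VPERM_Transform_State with VPERM_IPT maps data into that basis
 * before the rounds, VPERM_OPT maps it back afterwards, and the round
 * constants are pre-transformed once by VPERM_Transform_RoundConst so that
 * they can be xored directly onto transformed state; the extra 0x15...15
 * additions appear to correct for the constant term of that transform. */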
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant + ShiftBytes (interleaved) */\
|
||||
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
|
||||
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
|
||||
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
|
||||
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
|
||||
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
|
||||
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
|
||||
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
|
||||
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
|
||||
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
|
||||
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
|
||||
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
|
||||
/* SubBytes + Multiplication by 2 and 4 */\
|
||||
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}/**/
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
|
||||
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
|
||||
}
|
||||
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
|
||||
\
|
||||
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
|
||||
\
|
||||
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
|
||||
\
|
||||
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
|
||||
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
|
||||
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
|
||||
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
|
||||
\
|
||||
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
|
||||
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
|
||||
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
|
||||
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst_CNT2(i, j){\
|
||||
asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\
|
||||
asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\
|
||||
VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
|
||||
asm ("pxor xmm0, [ALL_15]");\
|
||||
asm ("pxor xmm1, [ALL_15]");\
|
||||
asm ("pxor xmm2, [ALL_15]");\
|
||||
asm ("pxor xmm3, [ALL_15]");\
|
||||
asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\
|
||||
asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\
|
||||
asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\
|
||||
asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\
|
||||
}/**/
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst(){\
|
||||
asm ("movaps xmm0, [ROUND_CONST_Lx]");\
|
||||
VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
|
||||
asm ("pxor xmm0, [ALL_15]");\
|
||||
asm ("movaps [ROUND_CONST_Lx], xmm0");\
|
||||
VPERM_Transform_RoundConst_CNT2(0, 1);\
|
||||
VPERM_Transform_RoundConst_CNT2(2, 3);\
|
||||
VPERM_Transform_RoundConst_CNT2(4, 5);\
|
||||
VPERM_Transform_RoundConst_CNT2(6, 7);\
|
||||
VPERM_Transform_RoundConst_CNT2(8, 9);\
|
||||
}/**/
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
/* System V AMD64 ABI calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
asm volatile ("emms");
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
VPERM_Transform_RoundConst();
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
asm ("movaps xmm12, [rdi+0*16]");
|
||||
asm ("movaps xmm13, [rdi+1*16]");
|
||||
asm ("movaps xmm14, [rdi+2*16]");
|
||||
asm ("movaps xmm15, [rdi+3*16]");
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* store transposed IV */
|
||||
asm ("movaps [rdi+0*16], xmm12");
|
||||
asm ("movaps [rdi+1*16], xmm2");
|
||||
asm ("movaps [rdi+2*16], xmm6");
|
||||
asm ("movaps [rdi+3*16], xmm7");
|
||||
|
||||
asm volatile ("emms");
|
||||
asm (".att_syntax noprefix");
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
/* System V AMD64 ABI calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
/* message M in rsi */
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load message into registers xmm12 - xmm15 (Q = message) */
|
||||
asm ("movaps xmm12, [rsi+0*16]");
|
||||
asm ("movaps xmm13, [rsi+1*16]");
|
||||
asm ("movaps xmm14, [rsi+2*16]");
|
||||
asm ("movaps xmm15, [rsi+3*16]");
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm0, [rdi+1*16]");
|
||||
asm ("movaps xmm4, [rdi+2*16]");
|
||||
asm ("movaps xmm5, [rdi+3*16]");
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
asm ("pxor xmm8, xmm12");
|
||||
asm ("pxor xmm0, xmm2");
|
||||
asm ("pxor xmm4, xmm6");
|
||||
asm ("pxor xmm5, xmm7");
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
|
||||
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, xmm8");
|
||||
asm ("pxor xmm1, xmm10");
|
||||
asm ("pxor xmm2, xmm12");
|
||||
asm ("pxor xmm3, xmm14");
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, [rdi+0*16]");
|
||||
asm ("pxor xmm1, [rdi+1*16]");
|
||||
asm ("pxor xmm2, [rdi+2*16]");
|
||||
asm ("pxor xmm3, [rdi+3*16]");
|
||||
|
||||
/* store CV */
|
||||
asm ("movaps [rdi+0*16], xmm0");
|
||||
asm ("movaps [rdi+1*16], xmm1");
|
||||
asm ("movaps [rdi+2*16], xmm2");
|
||||
asm ("movaps [rdi+3*16], xmm3");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
/* System V AMD64 ABI calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm10, [rdi+1*16]");
|
||||
asm ("movaps xmm12, [rdi+2*16]");
|
||||
asm ("movaps xmm14, [rdi+3*16]");
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("pxor xmm8, [rdi+0*16]");
|
||||
asm ("pxor xmm10, [rdi+1*16]");
|
||||
asm ("pxor xmm12, [rdi+2*16]");
|
||||
asm ("pxor xmm14, [rdi+3*16]");
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
|
||||
VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
asm ("movaps [rdi+2*16], xmm9");
|
||||
asm ("movaps [rdi+3*16], xmm11");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
496
algo/groestl/aes_ni/groestl256-intr-aes.h
Normal file
496
algo/groestl/aes_ni/groestl256-intr-aes.h
Normal file
@@ -0,0 +1,496 @@
|
||||
/* groestl256-intr-aes.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
|
||||
* instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_FF;
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_xor_si128(j, j);\
|
||||
j = _mm_cmpgt_epi8(j, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
}
|
||||
|
||||
/**/
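/* Scalar picture of what MUL2 computes, added for reference only (this
 * helper is not part of the original code): doubling a byte in GF(2^8) with
 * the AES/Groestl reduction polynomial 0x11b is a left shift by one followed
 * by a conditional xor with 0x1b.  MUL2 does the same for 16 bytes at once:
 * the signed compare yields 0xff exactly for bytes whose top bit is set,
 * which is masked down to 0x1b and xored into the shifted bytes. */
static inline unsigned char gf256_mul2_ref(unsigned char x)
{
  /* (x << 1) reduced modulo x^8 + x^4 + x^3 + x + 1 */
  return (unsigned char)((x << 1) ^ ((x & 0x80) ? 0x1b : 0x00));
}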
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+1}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t_{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y_{i+4} in registers b0..b7 by adding t_i */\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
/* save t0, t1, t2 in b0, b1 and memory (TEMP2) */\
|
||||
b0 = a0;\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp b0 and the all-0x1b constant in b1 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = ALL_1B;\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add v_{i+3} to y_{i+4} to obtain b_i */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
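/* Reference sketch of the relations listed above, applied to one byte
 * column; not part of the original code, only meant to make the register
 * scheduling in MixBytes easier to follow.  It reuses the scalar doubling
 * helper gf256_mul2_ref sketched next to MUL2. */
static inline void mixbytes_column_ref(const unsigned char a[8], unsigned char b[8])
{
  unsigned char t[8], x[8], y[8], w[8];
  int i;
  for (i = 0; i < 8; i++) t[i] = a[i] ^ a[(i + 1) & 7];                  /* t_i = a_i + a_{i+1} */
  for (i = 0; i < 8; i++) x[i] = t[i] ^ t[(i + 3) & 7];                  /* x_i = t_i + t_{i+3} */
  for (i = 0; i < 8; i++) y[i] = t[i] ^ t[(i + 2) & 7] ^ a[(i + 6) & 7]; /* y_i = t_i + t_{i+2} + a_{i+6} */
  for (i = 0; i < 8; i++) w[i] = gf256_mul2_ref(x[i]) ^ y[(i + 4) & 7];  /* z_i = 2*x_i, w_i = z_i + y_{i+4} */
  for (i = 0; i < 8; i++) b[i] = gf256_mul2_ref(w[(i + 3) & 7]) ^ y[(i + 4) & 7]; /* v_i = 2*w_i, b_i = v_{i+3} + y_{i+4} */
}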
|
||||
|
||||
#define SET_CONSTANTS() do{\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}while(0);
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
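/* Note on the SubBytes step below: _mm_aesenclast_si128 performs ShiftRows,
 * SubBytes and AddRoundKey but no MixColumns, so with an all-zero key it
 * reduces to SubBytes combined with the AES ShiftRows permutation.  The
 * SUBSH_MASK shuffles applied just before appear to be chosen so that
 * Groestl's ShiftBytes composed with that implicit ShiftRows leaves every
 * byte in its intended position. */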
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
\
|
||||
o1 = i0;\
|
||||
t0 = i2;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi16(o1, i1);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
\
|
||||
o2 = i0;\
|
||||
o3 = o1;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
o2 = _mm_unpackhi_epi32(o2, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o3, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = i0;\
|
||||
o2 = i1;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i4);\
|
||||
o3 = i1;\
|
||||
o4 = i2;\
|
||||
o2 = _mm_unpacklo_epi64(o2, i5);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i5);\
|
||||
o5 = i2;\
|
||||
o6 = i3;\
|
||||
o4 = _mm_unpacklo_epi64(o4, i6);\
|
||||
o5 = _mm_unpackhi_epi64(o5, i6);\
|
||||
o7 = i3;\
|
||||
o6 = _mm_unpacklo_epi64(o6, i7);\
|
||||
o7 = _mm_unpackhi_epi64(o7, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o0 = _mm_unpackhi_epi64(o0, i1);\
|
||||
o1 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i3);\
|
||||
o2 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o2 = _mm_unpackhi_epi64(o2, i5);\
|
||||
o3 = i6;\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
i7 = i6;\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i1, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i3, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i5, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
xmm13 = chaining[1];
|
||||
xmm14 = chaining[2];
|
||||
xmm15 = chaining[3];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm12;
|
||||
chaining[1] = xmm2;
|
||||
chaining[2] = xmm6;
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
__m128i* const message = (__m128i*) m;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
xmm13 = message[1];
|
||||
xmm14 = message[2];
|
||||
xmm15 = message[3];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
xmm8 = chaining[0];
|
||||
xmm0 = chaining[1];
|
||||
xmm4 = chaining[2];
|
||||
xmm5 = chaining[3];
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm12);
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm2);
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm6);
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm7);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
|
||||
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
|
||||
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
|
||||
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
|
||||
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
chaining[1] = xmm1;
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
xmm10 = chaining[1];
|
||||
xmm12 = chaining[2];
|
||||
xmm14 = chaining[3];
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[2] = xmm9;
|
||||
chaining[3] = xmm11;
|
||||
}
|
||||
|
||||
|
||||
482
algo/groestl/aes_ni/groestl256-intr-avx.h
Normal file
482
algo/groestl/aes_ni/groestl256-intr-avx.h
Normal file
@@ -0,0 +1,482 @@
|
||||
/* groestl256-intr-avx.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx
|
||||
* instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include <immintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_FF;
|
||||
//#if LENGTH <= 256
|
||||
__m128i ALL_1B;
|
||||
//#else
|
||||
//__m256d ALL_1B;
|
||||
//#endif
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos)))
|
||||
#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos))
|
||||
|
||||
#define SET_CONSTANTS() do{\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}while(0);
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b
|
||||
* xmm[z] has to be zero */
|
||||
#define VMUL2(i, j, k, z){\
|
||||
j = _mm_cmpgt_epi8(z, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
}/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+1}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t_{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
|
||||
b0 = a2;\
|
||||
b1 = a3;\
|
||||
b2 = a4;\
|
||||
b3 = a5;\
|
||||
b4 = a6;\
|
||||
b5 = a7;\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y_{i+4} in registers b0..b7 by adding t_i */\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
\
|
||||
/* spill values y_4, y_5, y_6 to memory */\
|
||||
TEMP0 = b0;\
|
||||
TEMP1 = b1;\
|
||||
TEMP2 = b2;\
|
||||
\
|
||||
/* save t0, t1, t2 in b0, b1 and memory (TEMP3) */\
|
||||
b0 = a0;\
|
||||
b1 = a1;\
|
||||
TEMP3 = a2;\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP3);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp b0, the all-0x1b constant in b1 and zero in b2 */\
|
||||
b1 = ALL_1B;\
|
||||
b2 = _mm_xor_si128(b2, b2);\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a2 = _mm_xor_si128(a2, TEMP2);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/*compute v_i: double w_i */\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
\
|
||||
/* add v_{i+3} to y_{i+4} to obtain b_i */\
|
||||
b0 = _mm_xor_si128(a3, TEMP0);\
|
||||
b1 = _mm_xor_si128(a4, TEMP1);\
|
||||
b2 = _mm_xor_si128(a5, TEMP2);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
}/*MixBytes*/
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* Add Round Constant */\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
\
|
||||
o1 = _mm_unpackhi_epi16(i0, i1);\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
t0 = _mm_unpackhi_epi16(i2, i3);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
\
|
||||
o2 = _mm_unpackhi_epi32(i0, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o1, t0);\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = _mm_unpackhi_epi64(i0, i4);\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o2 = _mm_unpacklo_epi64(i1, i5);\
|
||||
o3 = _mm_unpackhi_epi64(i1, i5);\
|
||||
o4 = _mm_unpacklo_epi64(i2, i6);\
|
||||
o5 = _mm_unpackhi_epi64(i2, i6);\
|
||||
o6 = _mm_unpacklo_epi64(i3, i7);\
|
||||
o7 = _mm_unpackhi_epi64(i3, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = _mm_unpackhi_epi64(i0, i1);\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi64(i2, i3);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o2 = _mm_unpackhi_epi64(i4, i5);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o3 = _mm_unpackhi_epi64(i6, i7);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i0, t0);\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i2, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i4, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i6, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
xmm13 = chaining[1];
|
||||
xmm14 = chaining[2];
|
||||
xmm15 = chaining[3];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm12;
|
||||
chaining[1] = xmm2;
|
||||
chaining[2] = xmm6;
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
__m128i* const message = (__m128i*) m;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static __m128i TEMP3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
xmm13 = message[1];
|
||||
xmm14 = message[2];
|
||||
xmm15 = message[3];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* load previous chaining value and xor message to CV to get input of P */
|
||||
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm12, chaining[0]);
|
||||
xmm0 = _mm_xor_si128(xmm2, chaining[1]);
|
||||
xmm4 = _mm_xor_si128(xmm6, chaining[2]);
|
||||
xmm5 = _mm_xor_si128(xmm7, chaining[3]);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
|
||||
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, chaining[0]);
|
||||
xmm1 = _mm_xor_si128(xmm1, chaining[1]);
|
||||
xmm2 = _mm_xor_si128(xmm2, chaining[2]);
|
||||
xmm3 = _mm_xor_si128(xmm3, chaining[3]);
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
chaining[1] = xmm1;
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
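
Structurally, TF512 above computes the Groestl compression function for the 512-bit state: the new chaining value is P(CV xor M) xor Q(M) xor CV. A minimal scalar sketch of that feed-forward structure, with permutation_P and permutation_Q as hypothetical stand-ins for the permutations computed by ROUNDS_P_Q() (they are not defined here):

#include <stdint.h>
#include <string.h>

typedef void (*perm512_fn)(uint8_t state[64]);

/* sketch: new_cv = P(cv ^ m) ^ Q(m) ^ cv on a 64-byte state */
static void compress512(uint8_t cv[64], const uint8_t m[64],
                        perm512_fn permutation_P, perm512_fn permutation_Q)
{
    uint8_t p_in[64], q_in[64];
    for (int i = 0; i < 64; i++)
        p_in[i] = cv[i] ^ m[i];      /* input of P */
    memcpy(q_in, m, 64);             /* input of Q */
    permutation_P(p_in);
    permutation_Q(q_in);
    for (int i = 0; i < 64; i++)
        cv[i] ^= p_in[i] ^ q_in[i];  /* xor the outputs, then feed cv forward */
}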
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static __m128i TEMP3;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
xmm10 = chaining[1];
|
||||
xmm12 = chaining[2];
|
||||
xmm14 = chaining[3];
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[2] = xmm9;
|
||||
chaining[3] = xmm11;
|
||||
}
|
||||
|
||||
|
||||
793
algo/groestl/aes_ni/groestl256-intr-vperm.h
Normal file
@@ -0,0 +1,793 @@
|
||||
/* groestl-intr-vperm.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3 instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer
|
||||
*
|
||||
* Based on the vperm and aes_ni implementations of the hash function Groestl
|
||||
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
|
||||
* Institute of Applied Mathematics, Middle East Technical University, Turkey
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <tmmintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_0F;
|
||||
__m128i ALL_15;
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_63;
|
||||
__m128i ALL_FF;
|
||||
__m128i VPERM_IPT[2];
|
||||
__m128i VPERM_OPT[2];
|
||||
__m128i VPERM_INV[2];
|
||||
__m128i VPERM_SB1[2];
|
||||
__m128i VPERM_SB2[2];
|
||||
__m128i VPERM_SB4[2];
|
||||
__m128i VPERM_SBO[2];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define SET_SHARED_CONSTANTS(){\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
|
||||
ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\
|
||||
ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\
|
||||
VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\
|
||||
VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\
|
||||
VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\
|
||||
VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\
|
||||
VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\
|
||||
VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\
|
||||
VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\
|
||||
VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\
|
||||
VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\
|
||||
VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\
|
||||
VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\
|
||||
VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\
|
||||
}/**/
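
The TRANSP_MASK value set above is a pshufb control mask: byte i of the shuffled result is taken from source byte mask[i]. A small stand-alone sketch (assumes an SSSE3-capable build, e.g. compiled with -mssse3) that prints the byte mapping the mask encodes:

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>

int main(void)
{
    /* same constant as TRANSP_MASK above */
    __m128i mask = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);
    uint8_t idx[16];
    _mm_storeu_si128((__m128i*)idx, mask);
    for (int i = 0; i < 16; i++)
        printf("dst[%2d] <- src[%2d]\n", i, idx[i]);  /* pshufb byte mapping */
    return 0;
}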
|
||||
|
||||
/* VPERM
|
||||
* Transform w/o setting c*
|
||||
* transforms 2 rows to/from "vperm mode"
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Groestl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0, a1 = 2 rows
|
||||
* table = transformation table to use
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0, a1 = 2 rows transformed with table
|
||||
* */
|
||||
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
|
||||
t0 = c0;\
|
||||
t1 = c0;\
|
||||
t0 = _mm_andnot_si128(t0, a0);\
|
||||
t1 = _mm_andnot_si128(t1, a1);\
|
||||
t0 = _mm_srli_epi32(t0, 4);\
|
||||
t1 = _mm_srli_epi32(t1, 4);\
|
||||
a0 = _mm_and_si128(a0, c0);\
|
||||
a1 = _mm_and_si128(a1, c0);\
|
||||
t2 = c2;\
|
||||
t3 = c2;\
|
||||
t2 = _mm_shuffle_epi8(t2, a0);\
|
||||
t3 = _mm_shuffle_epi8(t3, a1);\
|
||||
a0 = c1;\
|
||||
a1 = c1;\
|
||||
a0 = _mm_shuffle_epi8(a0, t0);\
|
||||
a1 = _mm_shuffle_epi8(a1, t1);\
|
||||
a0 = _mm_xor_si128(a0, t2);\
|
||||
a1 = _mm_xor_si128(a1, t3);\
|
||||
}/**/
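
The vperm trick used by VPERM_Transform_No_Const is to split every byte into its low and high nibble, run each nibble through a 16-entry pshufb table lookup (c2 indexed by the low nibble, c1 by the high nibble), and xor the two results. A scalar sketch of the same per-byte computation, with table_lo/table_hi as hypothetical 16-byte tables standing in for the two halves of a VPERM_* constant:

#include <stdint.h>
#include <stddef.h>

/* sketch: out[i] = table_lo[ low nibble of buf[i] ] ^ table_hi[ high nibble ] */
static void vperm_transform_bytes(uint8_t *buf, size_t len,
                                  const uint8_t table_lo[16],
                                  const uint8_t table_hi[16])
{
    for (size_t i = 0; i < len; i++)
        buf[i] = (uint8_t)(table_lo[buf[i] & 0x0f] ^ table_hi[buf[i] >> 4]);
}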
|
||||
|
||||
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
|
||||
c0 = ALL_0F;\
|
||||
c1 = ((__m128i*) table )[0];\
|
||||
c2 = ((__m128i*) table )[1];\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform
|
||||
* transforms 2 rows to/from "vperm mode"
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Groestl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0, a1 = 2 rows
|
||||
* table = transformation table to use
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0, a1 = 2 rows transformed with table
|
||||
* */
|
||||
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform State
|
||||
* inputs:
|
||||
* a0-a3 = state
|
||||
* table = transformation table to use
|
||||
* t* = clobbers
|
||||
* outputs:
|
||||
* a0-a3 = transformed state
|
||||
* */
|
||||
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Add Constant to State
|
||||
* inputs:
|
||||
* a0-a7 = state
|
||||
* constant = constant to add
|
||||
* t0 = clobber
|
||||
* outputs:
|
||||
* a0-a7 = state + constant
|
||||
* */
|
||||
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
|
||||
t0 = constant;\
|
||||
a0 = _mm_xor_si128(a0, t0);\
|
||||
a1 = _mm_xor_si128(a1, t0);\
|
||||
a2 = _mm_xor_si128(a2, t0);\
|
||||
a3 = _mm_xor_si128(a3, t0);\
|
||||
a4 = _mm_xor_si128(a4, t0);\
|
||||
a5 = _mm_xor_si128(a5, t0);\
|
||||
a6 = _mm_xor_si128(a6, t0);\
|
||||
a7 = _mm_xor_si128(a7, t0);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Set Substitute Core Constants
|
||||
* */
|
||||
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Substitute Core
|
||||
* first part of sbox inverse computation
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Groestl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0 = 1 row
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* b0a, b0b = inputs for lookup step
|
||||
* */
|
||||
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
|
||||
t0 = c0;\
|
||||
t0 = _mm_andnot_si128(t0, a0);\
|
||||
t0 = _mm_srli_epi32(t0, 4);\
|
||||
a0 = _mm_and_si128(a0, c0);\
|
||||
b0a = c1;\
|
||||
b0a = _mm_shuffle_epi8(b0a, a0);\
|
||||
a0 = _mm_xor_si128(a0, t0);\
|
||||
b0b = c2;\
|
||||
b0b = _mm_shuffle_epi8(b0b, t0);\
|
||||
b0b = _mm_xor_si128(b0b, b0a);\
|
||||
t1 = c2;\
|
||||
t1 = _mm_shuffle_epi8(t1, a0);\
|
||||
t1 = _mm_xor_si128(t1, b0a);\
|
||||
b0a = c2;\
|
||||
b0a = _mm_shuffle_epi8(b0a, b0b);\
|
||||
b0a = _mm_xor_si128(b0a, a0);\
|
||||
b0b = c2;\
|
||||
b0b = _mm_shuffle_epi8(b0b, t1);\
|
||||
b0b = _mm_xor_si128(b0b, t0);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Lookup
|
||||
* second part of sbox inverse computation
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Groestl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0a, a0b = output of Substitution Core
|
||||
* table = lookup table to use (*1 / *2 / *4)
|
||||
* t0 = clobber
|
||||
* outputs:
|
||||
* b0 = output of sbox + multiplication
|
||||
* */
|
||||
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
|
||||
b0 = ((__m128i*) table )[0];\
|
||||
t0 = ((__m128i*) table )[1];\
|
||||
b0 = _mm_shuffle_epi8(b0, a0b);\
|
||||
t0 = _mm_shuffle_epi8(t0, a0a);\
|
||||
b0 = _mm_xor_si128(b0, t0);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* SubBytes and *2 / *4
|
||||
* this function is derived from:
|
||||
* Constant-time SSSE3 AES core implementation
|
||||
* by Mike Hamburg
|
||||
* and
|
||||
* vperm and aes_ni implementations of hash function Groestl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0-a7 = state
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0-a7 = state * 4
|
||||
* c2 = row0 * 2 -> b0
|
||||
* c1 = row7 * 2 -> b3
|
||||
* c0 = row7 * 1 -> b4
|
||||
* t2 = row4 * 1 -> b7
|
||||
* TEMP_MUL1 = row(i) * 1
|
||||
* TEMP_MUL2 = row(i) * 2
|
||||
*
|
||||
* call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
|
||||
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
|
||||
/* set Constants */\
|
||||
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
|
||||
/* row 1 */\
|
||||
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[1] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[1] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
|
||||
/* --- */\
|
||||
/* row 2 */\
|
||||
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[2] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[2] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
|
||||
/* --- */\
|
||||
/* row 3 */\
|
||||
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[3] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[3] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
|
||||
/* --- */\
|
||||
/* row 5 */\
|
||||
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[5] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[5] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
|
||||
/* --- */\
|
||||
/* row 6 */\
|
||||
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[6] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[6] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
|
||||
/* --- */\
|
||||
/* row 7 */\
|
||||
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[7] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
|
||||
/* --- */\
|
||||
/* row 4 */\
|
||||
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[4] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
|
||||
/* --- */\
|
||||
/* row 0 */\
|
||||
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
|
||||
TEMP_MUL2[0] = c2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
|
||||
/* --- */\
|
||||
}/**/
|
||||
|
||||
|
||||
/* Optimized MixBytes
|
||||
* inputs:
|
||||
* a0-a7 = (row0-row7) * 4
|
||||
* b0 = row0 * 2
|
||||
* b3 = row7 * 2
|
||||
* b4 = row7 * 1
|
||||
* b7 = row4 * 1
|
||||
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
|
||||
* output: b0-b7
|
||||
* */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* save one value */\
|
||||
TEMP_MUL4 = a3;\
|
||||
/* 1 */\
|
||||
b1 = a0;\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
|
||||
b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
|
||||
b2 = b1;\
|
||||
\
|
||||
/* 2 */\
|
||||
b5 = a1;\
|
||||
b5 = _mm_xor_si128(b5, a4);\
|
||||
b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
|
||||
b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
|
||||
b6 = b5;\
|
||||
\
|
||||
/* 4 */\
|
||||
b7 = _mm_xor_si128(b7, a6);\
|
||||
/*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
|
||||
b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
|
||||
b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
|
||||
b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
|
||||
b2 = _mm_xor_si128(b2, b7);\
|
||||
\
|
||||
/* 3 */\
|
||||
b0 = _mm_xor_si128(b0, a7);\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
|
||||
/*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
|
||||
b3 = b0;\
|
||||
b1 = _mm_xor_si128(b1, b0);\
|
||||
b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
|
||||
\
|
||||
/* 5 */\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
/*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
|
||||
b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
|
||||
b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
|
||||
b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
|
||||
b3 = _mm_xor_si128(b3, b4);\
|
||||
b6 = _mm_xor_si128(b6, b4);\
|
||||
\
|
||||
/* 6 */\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
|
||||
b4 = _mm_xor_si128(b4, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
\
|
||||
/* 7 */\
|
||||
a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
|
||||
a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
|
||||
b2 = _mm_xor_si128(b2, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
\
|
||||
/* 8 */\
|
||||
a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
|
||||
a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
|
||||
b6 = _mm_xor_si128(b6, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
\
|
||||
/* 9 */\
|
||||
a3 = TEMP_MUL1[2];\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
\
|
||||
/* 10 */\
|
||||
a1 = TEMP_MUL1[6];\
|
||||
a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
|
||||
b1 = _mm_xor_si128(b1, a1);\
|
||||
b4 = _mm_xor_si128(b4, a1);\
|
||||
\
|
||||
/* 11 */\
|
||||
a5 = TEMP_MUL1[3];\
|
||||
a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b6 = _mm_xor_si128(b6, a5);\
|
||||
\
|
||||
/* 12 */\
|
||||
a3 = TEMP_MUL1[7];\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
|
||||
b2 = _mm_xor_si128(b2, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
\
|
||||
/* 13 */\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
}/**/
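
MixBytes above consumes each state row multiplied by 1, 2 and 4 in GF(2^8); in this vperm variant the doublings come out of the VPERM_SB2/VPERM_SB4 lookup tables, but the underlying field operation is the familiar xtime step over the AES polynomial (the 0x1b that the ALL_1B constant refers to). A scalar sketch of those multiplications, assuming Groestl's field is GF(2^8) with reduction polynomial x^8 + x^4 + x^3 + x + 1:

#include <stdint.h>

/* sketch: multiply a field element by 2 (xtime) and by 4 */
static uint8_t gf256_double(uint8_t x)
{
    return (uint8_t)((x << 1) ^ ((x & 0x80) ? 0x1b : 0x00));
}

static uint8_t gf256_quadruple(uint8_t x)
{
    return gf256_double(gf256_double(x));
}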
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
SET_SHARED_CONSTANTS();\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}/**/
|
||||
|
||||
/* vperm:
|
||||
* transformation before rounds with ipt
|
||||
* first round add transformed constant
|
||||
* middle rounds: add constant XOR 0x15...15
|
||||
* last round: additionally add 0x15...15 after MB
|
||||
* transformation after rounds with opt
|
||||
*/
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant + ShiftBytes (interleaved) */\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
/* SubBytes + Multiplication by 2 and 4 */\
|
||||
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}/**/
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
|
||||
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
|
||||
}
|
||||
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
\
|
||||
o1 = i0;\
|
||||
t0 = i2;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi16(o1, i1);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
\
|
||||
o2 = i0;\
|
||||
o3 = o1;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
o2 = _mm_unpackhi_epi32(o2, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o3, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
* inputs are two 512-bit states with two rows in one xmm
* outputs are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = i0;\
|
||||
o2 = i1;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i4);\
|
||||
o3 = i1;\
|
||||
o4 = i2;\
|
||||
o2 = _mm_unpacklo_epi64(o2, i5);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i5);\
|
||||
o5 = i2;\
|
||||
o6 = i3;\
|
||||
o4 = _mm_unpacklo_epi64(o4, i6);\
|
||||
o5 = _mm_unpackhi_epi64(o5, i6);\
|
||||
o7 = i3;\
|
||||
o6 = _mm_unpacklo_epi64(o6, i7);\
|
||||
o7 = _mm_unpackhi_epi64(o7, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
* inputs are two 512-bit states with one row of each state in one xmm
* outputs are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o0 = _mm_unpackhi_epi64(o0, i1);\
|
||||
o1 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i3);\
|
||||
o2 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o2 = _mm_unpackhi_epi64(o2, i5);\
|
||||
o3 = i6;\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
i7 = i6;\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i1, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i3, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i5, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst_CNT2(i, j){\
|
||||
xmm0 = ROUND_CONST_L0[i];\
|
||||
xmm1 = ROUND_CONST_L7[i];\
|
||||
xmm2 = ROUND_CONST_L0[j];\
|
||||
xmm3 = ROUND_CONST_L7[j];\
|
||||
VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
|
||||
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
|
||||
xmm1 = _mm_xor_si128(xmm1, (ALL_15));\
|
||||
xmm2 = _mm_xor_si128(xmm2, (ALL_15));\
|
||||
xmm3 = _mm_xor_si128(xmm3, (ALL_15));\
|
||||
ROUND_CONST_L0[i] = xmm0;\
|
||||
ROUND_CONST_L7[i] = xmm1;\
|
||||
ROUND_CONST_L0[j] = xmm2;\
|
||||
ROUND_CONST_L7[j] = xmm3;\
|
||||
}/**/
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst(){\
|
||||
xmm0 = ROUND_CONST_Lx;\
|
||||
VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
|
||||
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
|
||||
ROUND_CONST_Lx = xmm0;\
|
||||
VPERM_Transform_RoundConst_CNT2(0, 1);\
|
||||
VPERM_Transform_RoundConst_CNT2(2, 3);\
|
||||
VPERM_Transform_RoundConst_CNT2(4, 5);\
|
||||
VPERM_Transform_RoundConst_CNT2(6, 7);\
|
||||
VPERM_Transform_RoundConst_CNT2(8, 9);\
|
||||
}/**/
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
VPERM_Transform_RoundConst();
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
xmm13 = chaining[1];
|
||||
xmm14 = chaining[2];
|
||||
xmm15 = chaining[3];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm12;
|
||||
chaining[1] = xmm2;
|
||||
chaining[2] = xmm6;
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
__m128i* const message = (__m128i*) m;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP_MUL1[8];
|
||||
static __m128i TEMP_MUL2[8];
|
||||
static __m128i TEMP_MUL4;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
xmm13 = message[1];
|
||||
xmm14 = message[2];
|
||||
xmm15 = message[3];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
xmm8 = chaining[0];
|
||||
xmm0 = chaining[1];
|
||||
xmm4 = chaining[2];
|
||||
xmm5 = chaining[3];
|
||||
|
||||
/* xor message to CV to get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm12);
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm2);
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm6);
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm7);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
|
||||
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
|
||||
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
|
||||
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
|
||||
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
chaining[1] = xmm1;
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP_MUL1[8];
|
||||
static __m128i TEMP_MUL2[8];
|
||||
static __m128i TEMP_MUL4;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
xmm10 = chaining[1];
|
||||
xmm12 = chaining[2];
|
||||
xmm14 = chaining[3];
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
|
||||
VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[2] = xmm9;
|
||||
chaining[3] = xmm11;
|
||||
|
||||
return;
|
||||
}//OF512()
|
||||
|
||||
|
||||
|
||||
306
algo/groestl/aes_ni/hash-groestl.c
Normal file
@@ -0,0 +1,306 @@
|
||||
/* hash.c Aug 2011
|
||||
*
|
||||
* Groestl implementation for different versions.
|
||||
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl.h"
|
||||
#include "miner.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
#include "groestl-version.h"
|
||||
|
||||
#ifdef TASM
|
||||
#ifdef VAES
|
||||
#include "groestl-asm-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl-asm-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl-asm-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#ifdef TINTR
|
||||
#ifdef VAES
|
||||
#include "groestl-intr-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl-intr-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl-intr-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/* digest up to len bytes of input (full blocks only) */
|
||||
void Transform(hashState_groestl *ctx,
|
||||
const u8 *in,
|
||||
unsigned long long len) {
|
||||
/* increment block counter */
|
||||
ctx->block_counter += len/SIZE;
|
||||
|
||||
/* digest message, one block at a time */
|
||||
for (; len >= SIZE; len -= SIZE, in += SIZE)
|
||||
#if LENGTH<=256
|
||||
TF512((u64*)ctx->chaining, (u64*)in);
|
||||
#else
|
||||
TF1024((u64*)ctx->chaining, (u64*)in);
|
||||
#endif
|
||||
|
||||
asm volatile ("emms");
|
||||
}
|
||||
|
||||
/* given state h, do h <- P(h)+h */
|
||||
void OutputTransformation(hashState_groestl *ctx) {
|
||||
/* determine variant */
|
||||
#if (LENGTH <= 256)
|
||||
OF512((u64*)ctx->chaining);
|
||||
#else
|
||||
OF1024((u64*)ctx->chaining);
|
||||
#endif
|
||||
|
||||
asm volatile ("emms");
|
||||
}
|
||||
|
||||
/* initialise context */
|
||||
HashReturn_gr init_groestl(hashState_groestl* ctx) {
|
||||
u8 i = 0;
|
||||
/* output size (in bits) must be a positive integer less than or
|
||||
equal to 512, and divisible by 8 */
|
||||
if (LENGTH <= 0 || (LENGTH%8) || LENGTH > 512)
|
||||
return BAD_HASHBITLEN_GR;
|
||||
|
||||
/* set number of state columns and state size depending on
|
||||
variant */
|
||||
ctx->columns = COLS;
|
||||
ctx->statesize = SIZE;
|
||||
#if (LENGTH <= 256)
|
||||
ctx->v = SHoRT;
|
||||
#else
|
||||
ctx->v = LoNG;
|
||||
#endif
|
||||
|
||||
SET_CONSTANTS();
|
||||
|
||||
for (i=0; i<SIZE/8; i++)
|
||||
ctx->chaining[i] = 0;
|
||||
for (i=0; i<SIZE; i++)
|
||||
ctx->buffer[i] = 0;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
/* set initial value */
|
||||
ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH);
|
||||
|
||||
INIT(ctx->chaining);
|
||||
|
||||
/* set other variables */
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->block_counter = 0;
|
||||
ctx->bits_in_last_byte = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
|
||||
HashReturn_gr reinit_groestl(hashState_groestl* ctx)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<SIZE/8; i++)
|
||||
ctx->chaining[i] = 0;
|
||||
for (i=0; i<SIZE; i++)
|
||||
ctx->buffer[i] = 0;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
/* set initial value */
|
||||
ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH);
|
||||
|
||||
INIT(ctx->chaining);
|
||||
|
||||
/* set other variables */
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->block_counter = 0;
|
||||
ctx->bits_in_last_byte = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
|
||||
/* update state with databitlen bits of input */
|
||||
HashReturn_gr update_groestl(hashState_groestl* ctx,
|
||||
const BitSequence_gr* input,
|
||||
DataLength_gr databitlen) {
|
||||
int index = 0;
|
||||
int msglen = (int)(databitlen/8);
|
||||
int rem = (int)(databitlen%8);
|
||||
|
||||
/* non-integral number of message bytes can only be supplied in the
|
||||
last call to this function */
|
||||
if (ctx->bits_in_last_byte) return FAIL_GR;
|
||||
|
||||
/* if the buffer contains data that has not yet been digested, first
|
||||
add data to buffer until full */
|
||||
|
||||
// The following block of code never gets hit when hashing x11 or quark
|
||||
// leave it here in case it might be needed.
|
||||
// if (ctx->buf_ptr)
|
||||
// {
|
||||
// while (ctx->buf_ptr < ctx->statesize && index < msglen)
|
||||
// {
|
||||
// ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
|
||||
// }
|
||||
// if (ctx->buf_ptr < ctx->statesize)
|
||||
// {
|
||||
// /* buffer still not full, return */
|
||||
// if (rem)
|
||||
// {
|
||||
// ctx->bits_in_last_byte = rem;
|
||||
// ctx->buffer[(int)ctx->buf_ptr++] = input[index];
|
||||
// }
|
||||
// return SUCCESS_GR;
|
||||
// }
|
||||
// /* digest buffer */
|
||||
// ctx->buf_ptr = 0;
|
||||
// printf("error\n");
|
||||
// Transform(ctx, ctx->buffer, ctx->statesize);
|
||||
// end dead code
|
||||
// }
|
||||
|
||||
/* digest bulk of message */
|
||||
Transform(ctx, input+index, msglen-index);
|
||||
index += ((msglen-index)/ctx->statesize)*ctx->statesize;
|
||||
|
||||
/* store remaining data in buffer */
|
||||
while (index < msglen)
|
||||
{
|
||||
ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
|
||||
}
|
||||
|
||||
// Another block that doesn't get used by x11 or quark
|
||||
// /* if non-integral number of bytes have been supplied, store
|
||||
// remaining bits in last byte, together with information about
|
||||
// number of bits */
|
||||
// if (rem)
|
||||
// {
|
||||
// ctx->bits_in_last_byte = rem;
|
||||
// ctx->buffer[(int)ctx->buf_ptr++] = input[index];
|
||||
// }
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
#define BILB ctx->bits_in_last_byte
|
||||
|
||||
/* finalise: process remaining data (including padding), perform
|
||||
output transformation, and write hash result to 'output' */
|
||||
HashReturn_gr final_groestl(hashState_groestl* ctx,
|
||||
BitSequence_gr* output) {
|
||||
int i, j = 0, hashbytelen = LENGTH/8;
|
||||
u8 *s = (BitSequence_gr*)ctx->chaining;
|
||||
|
||||
/* pad with '1'-bit and first few '0'-bits */
|
||||
if (BILB) {
|
||||
ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
|
||||
ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
|
||||
BILB = 0;
|
||||
}
|
||||
else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
|
||||
|
||||
/* pad with '0'-bits */
|
||||
if (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
|
||||
/* padding requires two blocks */
|
||||
while (ctx->buf_ptr < ctx->statesize) {
|
||||
ctx->buffer[(int)ctx->buf_ptr++] = 0;
|
||||
}
|
||||
/* digest first padding block */
|
||||
Transform(ctx, ctx->buffer, ctx->statesize);
|
||||
ctx->buf_ptr = 0;
|
||||
}
|
||||
while (ctx->buf_ptr < ctx->statesize-LENGTHFIELDLEN) {
|
||||
ctx->buffer[(int)ctx->buf_ptr++] = 0;
|
||||
}
|
||||
|
||||
/* length padding */
|
||||
ctx->block_counter++;
|
||||
ctx->buf_ptr = ctx->statesize;
|
||||
while (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
|
||||
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
|
||||
ctx->block_counter >>= 8;
|
||||
}
|
||||
|
||||
/* digest final padding block */
|
||||
Transform(ctx, ctx->buffer, ctx->statesize);
|
||||
/* perform output transformation */
|
||||
OutputTransformation(ctx);
|
||||
|
||||
/* store hash result in output */
|
||||
for (i = ctx->statesize-hashbytelen; i < ctx->statesize; i++,j++) {
|
||||
output[j] = s[i];
|
||||
}
|
||||
|
||||
/* zeroise relevant variables and deallocate memory */
|
||||
|
||||
for (i = 0; i < ctx->columns; i++) {
|
||||
ctx->chaining[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < ctx->statesize; i++) {
|
||||
ctx->buffer[i] = 0;
|
||||
}
|
||||
// free(ctx->chaining);
|
||||
// free(ctx->buffer);
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
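
The padding applied in final_groestl follows the Groestl rule: append a single 0x80 byte, then zero bytes, then the total block count as a big-endian counter in the last LENGTHFIELDLEN (8) bytes of a block. A small sketch (hypothetical helper, not used by this file) of how many bytes the padded message occupies for a given block size:

/* sketch: padded length = msglen + 1 (0x80) + zeros + 8 (block counter),
   rounded up to a whole number of blocks */
static unsigned long long groestl_padded_len(unsigned long long msglen,
                                             unsigned long long blocksize)
{
    unsigned long long rem = (msglen + 1 + 8) % blocksize;
    unsigned long long zeros = rem ? blocksize - rem : 0;
    return msglen + 1 + zeros + 8;
}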
|
||||
|
||||
/* hash bit sequence */
|
||||
HashReturn_gr hash_groestl(int hashbitlen,
|
||||
const BitSequence_gr* data,
|
||||
DataLength_gr databitlen,
|
||||
BitSequence_gr* hashval) {
|
||||
HashReturn_gr ret;
|
||||
hashState_groestl context;
|
||||
|
||||
/* initialise */
|
||||
if ((ret = init_groestl(&context)) != SUCCESS_GR)
|
||||
return ret;
|
||||
|
||||
/* process message */
|
||||
if ((ret = update_groestl(&context, data, databitlen)) != SUCCESS_GR)
|
||||
return ret;
|
||||
|
||||
/* finalise */
|
||||
ret = final_groestl(&context, hashval);
|
||||
|
||||
return ret;
|
||||
}
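
For illustration, a hypothetical caller of the one-shot wrapper above (assumes it is compiled in this translation unit, where LENGTH is 512, so the digest is 64 bytes; note that databitlen is given in bits):

static void example_hash_header(const unsigned char header[80])
{
    unsigned char digest[64];

    /* 80-byte block header, length passed in bits */
    if (hash_groestl(512, header, 80 * 8, digest) == SUCCESS_GR)
        printf("groestl-512: first byte %02x\n", digest[0]);
}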
|
||||
|
||||
/* eBash API */
|
||||
#ifdef crypto_hash_BYTES
|
||||
int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
|
||||
{
|
||||
if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0;
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
110
algo/groestl/aes_ni/hash-groestl.h
Normal file
@@ -0,0 +1,110 @@
|
||||
/* hash.h Aug 2011
|
||||
*
|
||||
* Groestl implementation for different versions.
|
||||
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#ifndef __hash_h
|
||||
#define __hash_h
|
||||
|
||||
#include <stdio.h>
|
||||
#if defined(_WIN64) || defined(__WINDOWS__)
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
|
||||
/* eBash API begin */
|
||||
/*
|
||||
#include "crypto_hash.h"
|
||||
#ifdef crypto_hash_BYTES
|
||||
|
||||
#include <crypto_uint8.h>
|
||||
#include <crypto_uint32.h>
|
||||
#include <crypto_uint64.h>
|
||||
typedef crypto_uint8 u8;
|
||||
typedef crypto_uint32 u32;
|
||||
typedef crypto_uint64 u64;
|
||||
#endif
|
||||
*/
|
||||
/* eBash API end */
|
||||
|
||||
#define LENGTH (512)
|
||||
|
||||
#include "brg_endian.h"
|
||||
#define NEED_UINT_64T
|
||||
#include "brg_types.h"
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
#include IACA_MARKS
|
||||
#endif
|
||||
|
||||
#ifndef LENGTH
|
||||
#define LENGTH (256)
|
||||
#endif
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
#define LENGTHFIELDLEN (ROWS)
|
||||
#define COLS512 (8)
|
||||
#define COLS1024 (16)
|
||||
#define SIZE512 ((ROWS)*(COLS512))
|
||||
#define SIZE1024 ((ROWS)*(COLS1024))
|
||||
#define ROUNDS512 (10)
|
||||
#define ROUNDS1024 (14)
|
||||
|
||||
#if LENGTH<=256
|
||||
#define COLS (COLS512)
|
||||
#define SIZE (SIZE512)
|
||||
#define ROUNDS (ROUNDS512)
|
||||
#else
|
||||
#define COLS (COLS1024)
|
||||
#define SIZE (SIZE1024)
|
||||
#define ROUNDS (ROUNDS1024)
|
||||
#endif
|
||||
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
#define U64BIG(a) (a)
|
||||
#endif /* IS_BIG_ENDIAN */
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
|
||||
#define U64BIG(a) \
|
||||
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
|
||||
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
|
||||
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
|
||||
(ROTL64(a,56) & li_64(FF000000FF000000)))
|
||||
#endif /* IS_LITTLE_ENDIAN */
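
On little-endian targets the rotate-and-mask form of U64BIG above is just a full 64-bit byte swap. A quick self-check sketch (assumes this header's macros and the u64 type from brg_types.h are in scope, and a GCC/Clang-style __builtin_bswap64):

#include <assert.h>

static void check_u64big(void)
{
    u64 x = 0x0123456789abcdefULL;
    assert(U64BIG(x) == __builtin_bswap64(x));  /* byte-swap equivalence */
}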
|
||||
|
||||
typedef enum { LoNG, SHoRT } Var;
|
||||
|
||||
/* NIST API begin */
|
||||
|
||||
typedef unsigned char BitSequence_gr;
|
||||
typedef unsigned long long DataLength_gr;
|
||||
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
|
||||
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
|
||||
__attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
|
||||
u64 block_counter; /* message block counter */
|
||||
int buf_ptr; /* data buffer pointer */
|
||||
int bits_in_last_byte; /* no. of message bits in last byte of
|
||||
data buffer */
|
||||
int columns; /* no. of columns in state */
|
||||
int statesize; /* total no. of bytes in state */
|
||||
Var v; /* LONG or SHORT */
|
||||
} hashState_groestl;
|
||||
|
||||
HashReturn_gr init_groestl(hashState_groestl*);
|
||||
HashReturn_gr reinit_groestl(hashState_groestl*);
|
||||
HashReturn_gr update_groestl(hashState_groestl*, const BitSequence_gr*, DataLength_gr);
|
||||
HashReturn_gr final_groestl(hashState_groestl*, BitSequence_gr*);
|
||||
HashReturn_gr hash_groestl(int, const BitSequence_gr*, DataLength_gr, BitSequence_gr*);
|
||||
/* NIST API end */
|
||||
|
||||
#endif /* __hash_h */
|
||||
309
algo/groestl/aes_ni/hash-groestl256.c
Normal file
@@ -0,0 +1,309 @@
|
||||
/* hash.c Aug 2011
|
||||
*
|
||||
* Groestl implementation for different versions.
|
||||
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl256.h"
|
||||
#include "miner.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
#include "groestl-version.h"
|
||||
|
||||
#ifdef TASM
|
||||
#ifdef VAES
|
||||
#include "groestl256-asm-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl256-asm-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl256-asm-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#ifdef TINTR
|
||||
#ifdef VAES
|
||||
#include "groestl256-intr-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl256-intr-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl256-intr-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/* digest up to len bytes of input (full blocks only) */
|
||||
void Transform256(hashState_groestl256 *ctx,
|
||||
const u8 *in,
|
||||
unsigned long long len) {
|
||||
/* increment block counter */
|
||||
ctx->block_counter += len/SIZE;
|
||||
|
||||
/* digest message, one block at a time */
|
||||
for (; len >= SIZE; len -= SIZE, in += SIZE)
|
||||
//#if LENGTH<=256
|
||||
TF512((u64*)ctx->chaining, (u64*)in);
|
||||
//#else
|
||||
// TF1024((u64*)ctx->chaining, (u64*)in);
|
||||
//#endif
|
||||
|
||||
asm volatile ("emms");
|
||||
}
|
||||
|
||||
/* given state h, do h <- P(h)+h */
|
||||
void OutputTransformation256(hashState_groestl256 *ctx) {
|
||||
/* determine variant */
|
||||
//#if (LENGTH <= 256)
|
||||
OF512((u64*)ctx->chaining);
|
||||
//#else
|
||||
// OF1024((u64*)ctx->chaining);
|
||||
//#endif
|
||||
|
||||
asm volatile ("emms");
|
||||
}
|
||||
|
||||
/* initialise context */
|
||||
HashReturn_gr init_groestl256(hashState_groestl256* ctx) {
|
||||
u8 i = 0;
|
||||
/* output size (in bits) must be a positive integer less than or
|
||||
equal to 512, and divisible by 8 */
|
||||
// if (LENGTH <= 0 || (LENGTH%8) || LENGTH > 512)
|
||||
// return BAD_HASHBITLEN_GR;
|
||||
|
||||
/* set number of state columns and state size depending on
|
||||
variant */
|
||||
ctx->columns = COLS;
|
||||
ctx->statesize = SIZE;
|
||||
//#if (LENGTH <= 256)
|
||||
ctx->v = SHoRT;
|
||||
//#else
|
||||
// ctx->v = LoNG;
|
||||
//#endif
|
||||
|
||||
SET_CONSTANTS();
|
||||
|
||||
for (i=0; i<SIZE/8; i++)
|
||||
ctx->chaining[i] = 0;
|
||||
for (i=0; i<SIZE; i++)
|
||||
ctx->buffer[i] = 0;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
/* set initial value */
|
||||
// ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH);
|
||||
ctx->chaining[ctx->columns-1] = U64BIG((u64)256);
|
||||
|
||||
INIT256(ctx->chaining);
|
||||
|
||||
/* set other variables */
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->block_counter = 0;
|
||||
ctx->bits_in_last_byte = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
|
||||
HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<SIZE/8; i++)
|
||||
ctx->chaining[i] = 0;
|
||||
for (i=0; i<SIZE; i++)
|
||||
ctx->buffer[i] = 0;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
|
||||
/* set initial value */
|
||||
// ctx->chaining[ctx->columns-1] = U64BIG((u64)LENGTH);
|
||||
ctx->chaining[ctx->columns-1] = 256;
|
||||
|
||||
INIT256(ctx->chaining);
|
||||
|
||||
/* set other variables */
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->block_counter = 0;
|
||||
ctx->bits_in_last_byte = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
|
||||
/* update state with databitlen bits of input */
|
||||
HashReturn_gr update_groestl256(hashState_groestl256* ctx,
|
||||
const BitSequence_gr* input,
|
||||
DataLength_gr databitlen) {
|
||||
int index = 0;
|
||||
int msglen = (int)(databitlen/8);
|
||||
int rem = (int)(databitlen%8);
|
||||
|
||||
/* non-integral number of message bytes can only be supplied in the
|
||||
last call to this function */
|
||||
if (ctx->bits_in_last_byte) return FAIL_GR;
|
||||
|
||||
/* if the buffer contains data that has not yet been digested, first
|
||||
add data to buffer until full */
|
||||
|
||||
// The following block of code never gets hit when hashing x11 or quark
|
||||
// leave it here in case it might be needed.
|
||||
// if (ctx->buf_ptr)
|
||||
// {
|
||||
// while (ctx->buf_ptr < ctx->statesize && index < msglen)
|
||||
// {
|
||||
// ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
|
||||
// }
|
||||
// if (ctx->buf_ptr < ctx->statesize)
|
||||
// {
|
||||
// /* buffer still not full, return */
|
||||
// if (rem)
|
||||
// {
|
||||
// ctx->bits_in_last_byte = rem;
|
||||
// ctx->buffer[(int)ctx->buf_ptr++] = input[index];
|
||||
// }
|
||||
// return SUCCESS_GR;
|
||||
// }
|
||||
// /* digest buffer */
|
||||
// ctx->buf_ptr = 0;
|
||||
// printf("error\n");
|
||||
// Transform(ctx, ctx->buffer, ctx->statesize);
|
||||
// end dead code
|
||||
// }
|
||||
|
||||
/* digest bulk of message */
|
||||
Transform256(ctx, input+index, msglen-index);
|
||||
index += ((msglen-index)/ctx->statesize)*ctx->statesize;
|
||||
|
||||
/* store remaining data in buffer */
|
||||
while (index < msglen)
|
||||
{
|
||||
ctx->buffer[(int)ctx->buf_ptr++] = input[index++];
|
||||
}
|
||||
|
||||
// Another block that doesn't get used by x11 or quark
|
||||
// /* if non-integral number of bytes have been supplied, store
|
||||
// remaining bits in last byte, together with information about
|
||||
// number of bits */
|
||||
// if (rem)
|
||||
// {
|
||||
// ctx->bits_in_last_byte = rem;
|
||||
// ctx->buffer[(int)ctx->buf_ptr++] = input[index];
|
||||
// }
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
#define BILB ctx->bits_in_last_byte

/* finalise: process remaining data (including padding), perform the
   output transformation, and write the hash result to 'output' */
HashReturn_gr final_groestl256(hashState_groestl256* ctx,
                               BitSequence_gr* output) {
//  int i, j = 0, hashbytelen = LENGTH/8;
  int i, j = 0, hashbytelen = 256/8;
  u8 *s = (BitSequence_gr*)ctx->chaining;

  /* pad with '1'-bit and first few '0'-bits */
  if (BILB) {
    ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB);
    ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB);
    BILB = 0;
  }
  else ctx->buffer[(int)ctx->buf_ptr++] = 0x80;

  /* pad with '0'-bits */
  if (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
    /* padding requires two blocks */
    while (ctx->buf_ptr < ctx->statesize) {
      ctx->buffer[(int)ctx->buf_ptr++] = 0;
    }
    /* digest first padding block */
    Transform256(ctx, ctx->buffer, ctx->statesize);
    ctx->buf_ptr = 0;
  }
  while (ctx->buf_ptr < ctx->statesize-LENGTHFIELDLEN) {
    ctx->buffer[(int)ctx->buf_ptr++] = 0;
  }

  /* length padding */
  ctx->block_counter++;
  ctx->buf_ptr = ctx->statesize;
  while (ctx->buf_ptr > ctx->statesize-LENGTHFIELDLEN) {
    ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
    ctx->block_counter >>= 8;
  }

  /* digest final padding block */
  Transform256(ctx, ctx->buffer, ctx->statesize);
  /* perform output transformation */
  OutputTransformation256(ctx);

  /* store hash result in output */
  for (i = ctx->statesize-hashbytelen; i < ctx->statesize; i++,j++) {
    output[j] = s[i];
  }

  /* zeroise relevant variables (nothing is heap-allocated here) */
  for (i = 0; i < ctx->columns; i++) {
    ctx->chaining[i] = 0;
  }
  for (i = 0; i < ctx->statesize; i++) {
    ctx->buffer[i] = 0;
  }
//  free(ctx->chaining);
//  free(ctx->buffer);

  return SUCCESS_GR;
}
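
// Illustrative note (not part of the original sources): with the 64-byte
// state used here, the final block assembled above has the layout
//    <buffered message bytes> 0x80 0x00 ... 0x00 <8-byte big-endian block count>
// e.g. an 80-byte message leaves 16 bytes in the buffer, so byte 16 becomes
// 0x80, bytes 17..55 are zeroed, and bytes 56..63 receive the total block
// count (2), assuming Transform256() advances block_counter once per full
// block, as in the reference implementation.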

/* hash bit sequence */
HashReturn_gr hash_groestl256(int hashbitlen,
                              const BitSequence_gr* data,
                              DataLength_gr databitlen,
                              BitSequence_gr* hashval) {
  HashReturn_gr ret;
  hashState_groestl256 context;

  /* initialise */
  if ((ret = init_groestl256(&context)) != SUCCESS_GR)
    return ret;

  /* process message */
  if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR)
    return ret;

  /* finalise */
  ret = final_groestl256(&context, hashval);

  return ret;
}
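
// Usage sketch (illustrative, not part of the original sources): the length
// argument is given in bits and the output buffer must hold 256/8 = 32 bytes.
//
//    BitSequence_gr digest[32];
//    unsigned char header[80];              /* e.g. a block header */
//    /* ... fill header ... */
//    if (hash_groestl256(256, header, 80 * 8, digest) != SUCCESS_GR)
//       { /* handle error */ }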

/* eBash API */
//#ifdef crypto_hash_BYTES
//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
//{
//  if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8, out) == SUCCESS_GR) return 0;
//  return -1;
//}
//#endif

#endif
110
algo/groestl/aes_ni/hash-groestl256.h
Normal file
@@ -0,0 +1,110 @@
/* hash.h     Aug 2011
 *
 * Groestl implementation for different versions.
 * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
 *
 * This code is placed in the public domain
 */

#ifndef __hash_h
#define __hash_h

#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>

/* eBash API begin */
/*
#include "crypto_hash.h"
#ifdef crypto_hash_BYTES

#include <crypto_uint8.h>
#include <crypto_uint32.h>
#include <crypto_uint64.h>
typedef crypto_uint8 u8;
typedef crypto_uint32 u32;
typedef crypto_uint64 u64;
#endif
*/
/* eBash API end */

//#define LENGTH (512)

#include "brg_endian.h"
#define NEED_UINT_64T
#include "brg_types.h"

#ifdef IACA_TRACE
  #include IACA_MARKS
#endif

//#ifndef LENGTH
//#define LENGTH (256)
//#endif

/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
#define COLS1024 (16)
#define SIZE512 ((ROWS)*(COLS512))
#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
#define ROUNDS1024 (14)

//#if LENGTH<=256
#define COLS (COLS512)
#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
//#else
//#define COLS (COLS1024)
//#define SIZE (SIZE1024)
//#define ROUNDS (ROUNDS1024)
//#endif

#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
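// Example (illustrative): ROTL64(0x0123456789ABCDEFULL, 8) == 0x23456789ABCDEF01ULL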

#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */

#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(n))))
#define U64BIG(a) \
  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
   (ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
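// On little-endian hosts U64BIG() amounts to a full 64-bit byte swap built
// from the rotate-and-mask idiom above, e.g. (illustrative):
//    U64BIG(0x0102030405060708ULL) == 0x0807060504030201ULL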

typedef enum { LoNG, SHoRT } Var;

/* NIST API begin */

typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2 } HashReturn_gr;

typedef struct {
  __attribute__ ((aligned (32))) u64 chaining[SIZE/8];        /* actual state */
  __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
  u64 block_counter;       /* message block counter */
  int buf_ptr;             /* data buffer pointer */
  int bits_in_last_byte;   /* number of message bits in the last byte of
                              the data buffer */
  int columns;             /* number of columns in the state */
  int statesize;           /* total number of bytes in the state */
  Var v;                   /* LONG or SHORT */
} hashState_groestl256;

HashReturn_gr init_groestl256(hashState_groestl256*);
HashReturn_gr reinit_groestl256(hashState_groestl256*);
HashReturn_gr update_groestl256(hashState_groestl256*, const BitSequence_gr*, DataLength_gr);
HashReturn_gr final_groestl256(hashState_groestl256*, BitSequence_gr*);
HashReturn_gr hash_groestl256(int, const BitSequence_gr*, DataLength_gr, BitSequence_gr*);
/* NIST API end */
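
/* Usage sketch for the API above (illustrative, not part of the original
   sources). Lengths are passed in bits, and since the buffer-refill path in
   update_groestl256() is commented out in this trimmed implementation, the
   whole message is expected in a single update call:

      hashState_groestl256 ctx;
      BitSequence_gr out[32];
      init_groestl256(&ctx);
      update_groestl256(&ctx, msg, msg_bytes * 8);
      final_groestl256(&ctx, out);
*/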

#endif /* __hash_h */
3
algo/groestl/aes_ni/implementors
Normal file
@@ -0,0 +1,3 @@
Krystian Matusiewicz
Günther A. Roland
Martin Schläffer