diff --git a/Dockerfile b/Dockerfile index 9db25ee..477e61e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,25 +1,23 @@ # -# Dockerfile for cpuminer -# usage: docker run creack/cpuminer --url xxxx --user xxxx --pass xxxx -# ex: docker run creack/cpuminer --url stratum+tcp://ltc.pool.com:80 --user creack.worker1 --pass abcdef -# +# Dockerfile for cpuminer-opt +# usage: docker build -t cpuminer-opt:latest . +# run: docker run -it --rm cpuminer-opt:latest [ARGS] +# ex: docker run -it --rm cpuminer-opt:latest -a cryptonight -o cryptonight.eu.nicehash.com:3355 -u 1MiningDW2GKzf4VQfmp4q2XoUvR6iy6PD.worker1 -p x -t 3 # -FROM ubuntu:12.10 -MAINTAINER Guillaume J. Charmes +FROM ubuntu:16.04 +RUN BUILD_DEPS="build-essential \ + libssl-dev \ + libgmp-dev \ + libcurl4-openssl-dev \ + libjansson-dev \ + automake" && \ -RUN apt-get update -qq + apt-get update && \ + apt-get install -y ${BUILD_DEPS} -RUN apt-get install -qqy automake -RUN apt-get install -qqy libcurl4-openssl-dev -RUN apt-get install -qqy git -RUN apt-get install -qqy make +COPY . 
/app/ +RUN cd /app/ && ./build.sh -RUN git clone https://github.com/pooler/cpuminer - -RUN cd cpuminer && ./autogen.sh -RUN cd cpuminer && ./configure CFLAGS="-O3" -RUN cd cpuminer && make - -WORKDIR /cpuminer -ENTRYPOINT ["./cpuminer"] +ENTRYPOINT ["/app/cpuminer"] +CMD ["-h"] diff --git a/Makefile.am b/Makefile.am index cbd8de4..4c2ad08 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,7 +81,9 @@ cpuminer_SOURCES = \ algo/groestl/myr-groestl.c \ algo/groestl/aes_ni/hash-groestl.c \ algo/groestl/aes_ni/hash-groestl256.c \ - algo/haval/haval.c\ + algo/groestl/sse2/grso.c \ + algo/groestl/sse2/grso-asm.c \ + algo/haval/haval.c \ algo/heavy/heavy.c \ algo/heavy/bastion.c \ algo/hmq1725.c \ @@ -119,6 +121,7 @@ cpuminer_SOURCES = \ algo/scrypt.c \ algo/scryptjane/scrypt-jane.c \ algo/sha2/sha2.c \ + algo/sha2/sha256t.c \ algo/simd/sse2/nist.c \ algo/simd/sse2/vector.c \ algo/skein/skein.c \ diff --git a/algo-gate-api.c b/algo-gate-api.c index 0901e3e..b5126bb 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -179,7 +179,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break; case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break; case ALGO_LYRA2Z: register_zcoin_algo ( gate ); break; - case ALGO_LYRA2ZOIN: register_zoin_algo ( gate ); break; + case ALGO_LYRA2Z330: register_lyra2z330_algo ( gate ); break; case ALGO_M7M: register_m7m_algo ( gate ); break; case ALGO_MYR_GR: register_myriad_algo ( gate ); break; case ALGO_NEOSCRYPT: register_neoscrypt_algo ( gate ); break; @@ -191,6 +191,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_SCRYPT: register_scrypt_algo ( gate ); break; case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break; case ALGO_SHA256D: register_sha256d_algo ( gate ); break; + case ALGO_SHA256T: register_sha256t_algo ( gate ); break; case ALGO_SHAVITE3: register_shavite_algo ( gate ); break; case ALGO_SKEIN: register_skein_algo ( gate ); break; case 
ALGO_SKEIN2: register_skein2_algo ( gate ); break; @@ -281,13 +282,14 @@ const char* const algo_alias_map[][2] = { "jane", "scryptjane" }, { "lyra2", "lyra2re" }, { "lyra2v2", "lyra2rev2" }, + { "lyra2zoin", "lyra2z330" }, { "myriad", "myr-gr" }, { "neo", "neoscrypt" }, { "sib", "x11gost" }, { "yes", "yescrypt" }, { "ziftr", "zr5" }, { "zcoin", "lyra2z" }, - { "zoin", "lyra2zoin" }, + { "zoin", "lyra2z330" }, { NULL, NULL } }; diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c index 5dd0d1e..6435c61 100644 --- a/algo/blake/blake2b.c +++ b/algo/blake/blake2b.c @@ -129,6 +129,8 @@ void blake2b_be_build_stratum_request( char *req, struct work *work ) free( xnonce2str ); } +#define min(a,b) ((a)>(b) ? (b) :(a)) /* operands parenthesized; each is still evaluated twice */ + // merkle root handled here, no need for gen_merkle_root gate target void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { @@ -161,6 +163,8 @@ void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) g_work->data[12+i] = ( (uint32_t*)merkle_root )[i]; } +#undef min + void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id, uint32_t* end_nonce_ptr, bool clean_job ) { diff --git a/algo/blake/decred.c b/algo/blake/decred.c index f4bc725..933da7e 100644 --- a/algo/blake/decred.c +++ b/algo/blake/decred.c @@ -194,6 +194,9 @@ void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) memcpy( decred_extraheader, &sctx->job.coinbase[32], decred_headersize); } */ + +#define min(a,b) ((a)>(b) ? 
(b) :(a)) /* operands parenthesized; each is still evaluated twice */ + void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { uchar merkle_root[64] = { 0 }; @@ -239,6 +242,8 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) //applog_hex(&work->data[36], 36); } +#undef min + /* bool decred_prevent_dupes( struct work* work, struct stratum_ctx* stratum, int thr_id ) diff --git a/algo/groestl/sse2/brg_endian.h b/algo/groestl/sse2/brg_endian.h new file mode 100644 index 0000000..e3cf0d1 --- /dev/null +++ b/algo/groestl/sse2/brg_endian.h @@ -0,0 +1,133 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose.
+ --------------------------------------------------------------------------- + Issue Date: 20/12/2007 +*/ + +#ifndef _BRG_ENDIAN_H +#define _BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __sun ) +# include <sys/isa_defs.h> +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include <sys/endian.h> +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include <machine/endian.h> +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include <endian.h> +# if !defined( __BEOS__ ) +# include <byteswap.h> +# endif +# endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN
) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define 
PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order +#endif + +#endif + +#endif diff --git a/algo/groestl/sse2/brg_types.h b/algo/groestl/sse2/brg_types.h new file mode 100644 index 0000000..0452655 --- /dev/null +++ b/algo/groestl/sse2/brg_types.h @@ -0,0 +1,231 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + (a few lines added by Soeren S. Thomsen, October 2008) + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + + The unsigned integer types defined here are of the form uint_<nn>t where + <nn> is the length of the type; for example, the unsigned 32-bit type is + 'uint_32t'. These are NOT the same as the 'C99 integer types' that are + defined in the inttypes.h and stdint.h headers since attempts to use these + types have shown that support for them is still highly variable.
However, + since the latter are of the form uint<nn>_t, a regular expression search + and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t') + can be used to convert the types used here to the C99 standard types. +*/ + +#ifndef _BRG_TYPES_H +#define _BRG_TYPES_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <limits.h> + +#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 ) +# include <stddef.h> +# define ptrint_t intptr_t +#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 ) +# include <stdint.h> +# define ptrint_t intptr_t +#else +# define ptrint_t int +#endif + +#ifndef BRG_UI8 +# define BRG_UI8 +# if UCHAR_MAX == 255u + typedef unsigned char uint_8t; +# else +# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h +# endif +#endif + +#ifndef BRG_UI16 +# define BRG_UI16 +# if USHRT_MAX == 65535u + typedef unsigned short uint_16t; +# else +# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h +# endif +#endif + +#ifndef BRG_UI32 +# define BRG_UI32 +# if UINT_MAX == 4294967295u +# define li_32(h) 0x##h##u + typedef unsigned int uint_32t; +# elif ULONG_MAX == 4294967295u +# define li_32(h) 0x##h##ul + typedef unsigned long uint_32t; +# elif defined( _CRAY ) +# error This code needs 32-bit data types, which Cray machines do not provide +# else +# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h +# endif +#endif + +#ifndef BRG_UI64 +# if defined( __BORLANDC__ ) && !defined( __MSDOS__ ) +# define BRG_UI64 +# define li_64(h) 0x##h##ui64 + typedef unsigned __int64 uint_64t; +# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */ +# define BRG_UI64 +# define li_64(h) 0x##h##ui64 + typedef unsigned __int64 uint_64t; +# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# elif defined( __MVS__ ) +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned int long long
uint_64t; +# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u +# if UINT_MAX == 18446744073709551615u +# define BRG_UI64 +# define li_64(h) 0x##h##u + typedef unsigned int uint_64t; +# endif +# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u +# if ULONG_MAX == 18446744073709551615ul +# define BRG_UI64 +# define li_64(h) 0x##h##ul + typedef unsigned long uint_64t; +# endif +# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u +# if ULLONG_MAX == 18446744073709551615ull +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u +# if ULONG_LONG_MAX == 18446744073709551615ull +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +# endif +#endif + +#if !defined( BRG_UI64 ) +# if defined( NEED_UINT_64T ) +# error Please define uint_64t as an unsigned 64 bit type in brg_types.h +# endif +#endif + +#ifndef RETURN_VALUES +# define RETURN_VALUES +# if defined( DLL_EXPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllexport ) void __stdcall +# define INT_RETURN __declspec( dllexport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllexport__ ) void +# define INT_RETURN __declspec( __dllexport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( DLL_IMPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllimport ) void __stdcall +# define INT_RETURN __declspec( dllimport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllimport__ ) void +# define INT_RETURN __declspec( __dllimport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( __WATCOMC__ ) +# define VOID_RETURN void __cdecl +# define INT_RETURN int __cdecl +# 
else +# define VOID_RETURN void +# define INT_RETURN int +# endif +#endif + +/* These defines are used to detect and set the memory alignment of pointers. + Note that offsets are in bytes. + + ALIGN_OFFSET(x,n) return the positive or zero offset of + the memory addressed by the pointer 'x' + from an address that is aligned on an + 'n' byte boundary ('n' is a power of 2) + + ALIGN_FLOOR(x,n) return a pointer that points to memory + that is aligned on an 'n' byte boundary + and is not higher than the memory address + pointed to by 'x' ('n' is a power of 2) + + ALIGN_CEIL(x,n) return a pointer that points to memory + that is aligned on an 'n' byte boundary + and is not lower than the memory address + pointed to by 'x' ('n' is a power of 2) +*/ + +#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1)) +#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1))) +#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1))) + +/* These defines are used to declare buffers in a way that allows + faster operations on longer variables to be used. In all these + defines 'size' must be a power of 2 and >= 8. NOTE that the + buffer size is in bytes but the type length is in bits + + UNIT_TYPEDEF(x,size) declares a variable 'x' of length + 'size' bits + + BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize' + bytes defined as an array of variables + each of 'size' bits (bsize must be a + multiple of size / 8) + + UNIT_CAST(x,size) casts a variable to a type of + length 'size' bits + + UPTR_CAST(x,size) casts a pointer to a pointer to a + variable of length 'size' bits +*/ + +#define UI_TYPE(size) uint_##size##t +#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x +#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)] +#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x)) +#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x)) + + /* Added by Soeren S.
Thomsen (begin) */ +#define u8 uint_8t +#define u32 uint_32t +#define u64 uint_64t + /* (end) */ + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/groestl/sse2/grso-asm.c b/algo/groestl/sse2/grso-asm.c new file mode 100644 index 0000000..474ebf0 --- /dev/null +++ b/algo/groestl/sse2/grso-asm.c @@ -0,0 +1,1063 @@ +/* mmx optimized asm */ + +#include "grso-asm.h" + +void grsoP1024ASM (u64 *x) { + asm ( + "\n movq 8(%0), %%rcx" + "\n movq 24(%0), %%rdx" + "\n movq $0, 8(%0)" + "\n 1:" + + "\n movq 0(%0), %%rax" + "\n movq 16(%0), %%rbx" + + "\n xorq $0x10, %%rcx" + "\n xorq $0x30, %%rdx" + "\n xorq 8(%0), %%rcx" + "\n xorq 8(%0), %%rdx" + "\n xorq $0x20, %%rbx" + "\n xorq 8(%0), %%rax" + "\n xorq 8(%0), %%rbx" + + "\n # processing input words x[1]=rcx and x[3]=rdx " + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n movq grsoT0(,%%rdi,8), %%mm1" + "\n movq grsoT1(,%%rsi,8), %%mm0" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n movq grsoT0(,%%rsi,8), %%mm3" + "\n movq grsoT1(,%%rdi,8), %%mm2" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n movq grsoT2(,%%rdi,8), %%r15" + "\n movq grsoT3(,%%rsi,8), %%r14" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n pxor grsoT2(,%%rsi,8), %%mm1" + "\n pxor grsoT3(,%%rdi,8), %%mm0" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n movq grsoT4(,%%rdi,8), %%r13" + "\n movq grsoT5(,%%rsi,8), %%r12" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT4(,%%rsi,8), %%r15" + "\n xorq grsoT5(,%%rdi,8), %%r14" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n movq grsoT6(,%%rdi,8), %%r11" + "\n movq grsoT7(,%%rsi,8), %%mm6" + "\n movzbl %%dl, %%edi" + "\n movzbl %%dh, %%esi" + "\n xorq grsoT6(,%%rdi,8), %%r13" + "\n movq grsoT7(,%%rsi,8), %%r8" + + + + "\n movq 40(%0), %%rcx" + "\n movq 56(%0), %%rdx" + + 
"\n xorq $0x50, %%rcx" + "\n xorq $0x70, %%rdx" + "\n xorq 8(%0), %%rcx" + "\n xorq 8(%0), %%rdx" + + + "\n # processing input words x[0]=rax and x[2]=rbx " + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT0(,%%rdi,8), %%mm0" + "\n xorq grsoT1(,%%rsi,8), %%r15" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT0(,%%rsi,8), %%mm2" + "\n pxor grsoT1(,%%rdi,8), %%mm1" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT2(,%%rdi,8), %%r14" + "\n xorq grsoT3(,%%rsi,8), %%r13" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT2(,%%rsi,8), %%mm0" + "\n xorq grsoT3(,%%rdi,8), %%r15" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT4(,%%rdi,8), %%r12" + "\n xorq grsoT5(,%%rsi,8), %%r11" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT4(,%%rsi,8), %%r14" + "\n xorq grsoT5(,%%rdi,8), %%r13" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n movq grsoT6(,%%rdi,8), %%r10" + "\n movq grsoT7(,%%rsi,8), %%mm5" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT6(,%%rsi,8), %%r12" + "\n movq grsoT7(,%%rdi,8), %%mm7" + + + + "\n movq 32(%0), %%rax" + "\n movq 48(%0), %%rbx" + + "\n xorq $0x40, %%rax" + "\n xorq $0x60, %%rbx" + "\n xorq 8(%0), %%rax" + "\n xorq 8(%0), %%rbx" + + "\n # processing input words x[5]=rcx and x[7]=rdx " + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n pxor grsoT0(,%%rdi,8), %%mm5" + "\n movq grsoT1(,%%rsi,8), %%mm4" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n pxor grsoT0(,%%rsi,8), %%mm7" + "\n pxor grsoT1(,%%rdi,8), %%mm6" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n pxor grsoT2(,%%rdi,8), %%mm3" + "\n pxor grsoT3(,%%rsi,8), %%mm2" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + 
"\n movzbl %%dh, %%edi" + "\n pxor grsoT2(,%%rsi,8), %%mm5" + "\n pxor grsoT3(,%%rdi,8), %%mm4" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n pxor grsoT4(,%%rdi,8), %%mm1" + "\n pxor grsoT5(,%%rsi,8), %%mm0" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n pxor grsoT4(,%%rsi,8), %%mm3" + "\n pxor grsoT5(,%%rdi,8), %%mm2" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq grsoT6(,%%rdi,8), %%r15" + "\n xorq grsoT7(,%%rsi,8), %%r10" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n pxor grsoT6(,%%rsi,8), %%mm1" + "\n xorq grsoT7(,%%rdi,8), %%r12" + + + + "\n movq 72(%0), %%rcx" + "\n movq 88(%0), %%rdx" + + "\n xorq $0x90, %%rcx" + "\n xorq $0xb0, %%rdx" + "\n xorq 8(%0), %%rcx" + "\n xorq 8(%0), %%rdx" + + "\n # processing input words x[4]=rax and x[6]=rbx " + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT0(,%%rdi,8), %%mm4" + "\n pxor grsoT1(,%%rsi,8), %%mm3" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT0(,%%rsi,8), %%mm6" + "\n pxor grsoT1(,%%rdi,8), %%mm5" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT2(,%%rdi,8), %%mm2" + "\n pxor grsoT3(,%%rsi,8), %%mm1" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT2(,%%rsi,8), %%mm4" + "\n pxor grsoT3(,%%rdi,8), %%mm3" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT4(,%%rdi,8), %%mm0" + "\n xorq grsoT5(,%%rsi,8), %%r15" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT4(,%%rsi,8), %%mm2" + "\n pxor grsoT5(,%%rdi,8), %%mm1" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT6(,%%rdi,8), %%r14" + "\n movq grsoT7(,%%rsi,8), %%r9" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT6(,%%rsi,8), 
%%mm0" + "\n xorq grsoT7(,%%rdi,8), %%r11" + + + "\n movq 64(%0), %%rax" + "\n movq 80(%0), %%rbx" + + "\n xorq $0x80, %%rax" + "\n xorq $0xa0, %%rbx" + "\n xorq 8(%0), %%rax" + "\n xorq 8(%0), %%rbx" + + "\n # processing input words x[9]=rcx and x[11]=rdx " + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq grsoT0(,%%rdi,8), %%r9" + "\n xorq grsoT1(,%%rsi,8), %%r8" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT0(,%%rsi,8), %%r11" + "\n xorq grsoT1(,%%rdi,8), %%r10" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n pxor grsoT2(,%%rdi,8), %%mm7" + "\n pxor grsoT3(,%%rsi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT2(,%%rsi,8), %%r9" + "\n xorq grsoT3(,%%rdi,8), %%r8" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n pxor grsoT4(,%%rdi,8), %%mm5" + "\n pxor grsoT5(,%%rsi,8), %%mm4" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n pxor grsoT4(,%%rsi,8), %%mm7" + "\n pxor grsoT5(,%%rdi,8), %%mm6" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n pxor grsoT6(,%%rdi,8), %%mm3" + "\n xorq grsoT7(,%%rsi,8), %%r14" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n pxor grsoT6(,%%rsi,8), %%mm5" + "\n pxor grsoT7(,%%rdi,8), %%mm0" + + + + "\n movq 104(%0), %%rcx" + "\n movq 120(%0), %%rdx" + + "\n xorq $0xd0, %%rcx" + "\n xorq $0xf0, %%rdx" + "\n xorq 8(%0), %%rcx" + "\n xorq 8(%0), %%rdx" + + "\n # processing input words x[8]=rax and x[10]=rbx " + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT0(,%%rdi,8), %%r8" + "\n pxor grsoT1(,%%rsi,8), %%mm7" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT0(,%%rsi,8), %%r10" + "\n xorq grsoT1(,%%rdi,8), %%r9" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor 
grsoT2(,%%rdi,8), %%mm6" + "\n pxor grsoT3(,%%rsi,8), %%mm5" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT2(,%%rsi,8), %%r8" + "\n pxor grsoT3(,%%rdi,8), %%mm7" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT4(,%%rdi,8), %%mm4" + "\n pxor grsoT5(,%%rsi,8), %%mm3" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT4(,%%rsi,8), %%mm6" + "\n pxor grsoT5(,%%rdi,8), %%mm5" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT6(,%%rdi,8), %%mm2" + "\n xorq grsoT7(,%%rsi,8), %%r13" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT6(,%%rsi,8), %%mm4" + "\n xorq grsoT7(,%%rdi,8), %%r15" + + "\n movq 96(%0), %%rax" + "\n movq 112(%0), %%rbx" + + "\n xorq $0xc0, %%rax" + "\n xorq $0xe0, %%rbx" + "\n xorq 8(%0), %%rax" + "\n xorq 8(%0), %%rbx" + + "\n # processing input words x[13]=rcx and x[15]=rdx " + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq grsoT0(,%%rdi,8), %%r13" + "\n xorq grsoT1(,%%rsi,8), %%r12" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT0(,%%rsi,8), %%r15" + "\n xorq grsoT1(,%%rdi,8), %%r14" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq grsoT2(,%%rdi,8), %%r11" + "\n xorq grsoT3(,%%rsi,8), %%r10" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT2(,%%rsi,8), %%r13" + "\n xorq grsoT3(,%%rdi,8), %%r12" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq grsoT4(,%%rdi,8), %%r9" + "\n xorq grsoT5(,%%rsi,8), %%r8" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT4(,%%rsi,8), %%r11" + "\n xorq grsoT5(,%%rdi,8), %%r10" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n pxor grsoT6(,%%rdi,8), %%mm7" + "\n pxor 
grsoT7(,%%rsi,8), %%mm2" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT6(,%%rsi,8), %%r9" + "\n pxor grsoT7(,%%rdi,8), %%mm4" + + + + "\n # processing input words x[12]=rax and x[14]=rbx " + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT0(,%%rdi,8), %%r12" + "\n xorq grsoT1(,%%rsi,8), %%r11" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT0(,%%rsi,8), %%r14" + "\n xorq grsoT1(,%%rdi,8), %%r13" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT2(,%%rdi,8), %%r10" + "\n xorq grsoT3(,%%rsi,8), %%r9" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT2(,%%rsi,8), %%r12" + "\n xorq grsoT3(,%%rdi,8), %%r11" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT4(,%%rdi,8), %%r8" + "\n pxor grsoT5(,%%rsi,8), %%mm7" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT4(,%%rsi,8), %%r10" + "\n xorq grsoT5(,%%rdi,8), %%r9" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT6(,%%rdi,8), %%mm6" + "\n pxor grsoT7(,%%rsi,8), %%mm1" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT6(,%%rsi,8), %%r8" + "\n pxor grsoT7(,%%rdi,8), %%mm3" + + "\n incq 8(%0) #increment counter" + + "\n movq 8(%0), %%rdi" + "\n cmp $14, %%edi" + "\n je 2f" + "\n movq %%mm1, %%rcx" + "\n movq %%mm3, %%rdx" + "\n movq %%mm0, 0(%0)" + "\n movq %%mm2, 16(%0)" + "\n movq %%mm4, 32(%0)" + "\n movq %%mm5, 40(%0)" + "\n movq %%mm6, 48(%0)" + "\n movq %%mm7, 56(%0)" + "\n movq %%r8 , 64(%0)" + "\n movq %%r9 , 72(%0)" + "\n movq %%r10, 80(%0)" + "\n movq %%r11, 88(%0)" + "\n movq %%r12, 96(%0)" + "\n movq %%r13, 104(%0)" + "\n movq %%r14, 112(%0)" + "\n movq %%r15, 120(%0)" + "\n jmp 1b" + "\n 2:" + "\n movq %%mm0, 0(%0)" + "\n movq %%mm1, 8(%0)" + "\n movq %%mm2, 16(%0)" + "\n movq %%mm3, 24(%0)" + 
"\n movq %%mm4, 32(%0)" + "\n movq %%mm5, 40(%0)" + "\n movq %%mm6, 48(%0)" + "\n movq %%mm7, 56(%0)" + "\n movq %%r8 , 64(%0)" + "\n movq %%r9 , 72(%0)" + "\n movq %%r10, 80(%0)" + "\n movq %%r11, 88(%0)" + "\n movq %%r12, 96(%0)" + "\n movq %%r13, 104(%0)" + "\n movq %%r14, 112(%0)" + "\n movq %%r15, 120(%0)" + : /*no output, only memory is modified */ + : "r"(x) + : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%rsi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory" , "%mm0", "%mm1", "%mm2" , "%mm3" , "%mm4" , "%mm5" , "%mm6" , "%mm7" ); +}//P512ASM() + + +void grsoQ1024ASM (u64 *x) { + asm ( + "\n movq 8(%0), %%rcx" + "\n movq 24(%0), %%rdx" + "\n movq $0, 8(%0)" + "\n 1:" + + "\n movq 0(%0), %%rax" + "\n movq 16(%0), %%rbx" + + /* add round constants to columns 0-3 */ + "\n movq $0xffffffffffffffff, %%r14" + "\n movq $0xefffffffffffffff, %%r15" + "\n xorq %%r14, %%rax" + "\n xorq %%r15, %%rcx" + "\n movq $0xdfffffffffffffff, %%r14" + "\n movq $0xcfffffffffffffff, %%r15" + "\n xorq %%r14, %%rbx" + "\n xorq %%r15, %%rdx" + + "\n # processing input words x[1]=rcx and x[3]=rdx " + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n movq grsoT0(,%%rdi,8), %%mm0" + "\n movq grsoT1(,%%rsi,8), %%r14" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n movq grsoT0(,%%rsi,8), %%mm2" + "\n pxor grsoT1(,%%rdi,8), %%mm0" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n movq grsoT2(,%%rdi,8), %%r12" + "\n movq grsoT3(,%%rsi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT2(,%%rsi,8), %%r14" + "\n movq grsoT3(,%%rdi,8), %%r8" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n movq grsoT4(,%%rdi,8), %%mm1" + "\n movq grsoT5(,%%rsi,8), %%r15" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n movq grsoT4(,%%rsi,8), %%mm3" + "\n pxor grsoT5(,%%rdi,8), %%mm1" + "\n shrq $16, 
%%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq 8(%0), %%rsi" + "\n movq grsoT6(,%%rdi,8), %%r13" + "\n movq grsoT7(,%%rsi,8), %%r11" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq 8(%0), %%rdi" + "\n xorq grsoT6(,%%rsi,8), %%r15" + "\n xorq grsoT7(,%%rdi,8), %%r13" + + + "\n # processing input words x[0]=rax and x[2]=rbx " + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT0(,%%rdi,8), %%r15" + "\n xorq grsoT1(,%%rsi,8), %%r13" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT0(,%%rsi,8), %%mm1" + "\n xorq grsoT1(,%%rdi,8), %%r15" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT2(,%%rdi,8), %%r11" + "\n movq grsoT3(,%%rsi,8), %%mm5" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT2(,%%rsi,8), %%r13" + "\n movq grsoT3(,%%rdi,8), %%mm7" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT4(,%%rdi,8), %%mm0" + "\n xorq grsoT5(,%%rsi,8), %%r14" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT4(,%%rsi,8), %%mm2" + "\n pxor grsoT5(,%%rdi,8), %%mm0" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq 8(%0), %%rsi" + "\n xorq grsoT6(,%%rdi,8), %%r12" + "\n movq grsoT7(,%%rsi,8), %%r10" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq 8(%0), %%rdi" + "\n xorq grsoT6(,%%rsi,8), %%r14" + "\n xorq grsoT7(,%%rdi,8), %%r12" + + /* read columns 4-7 from registers and add round constants to these */ + "\n movq %%r14, 128(%0)" + "\n movq %%r15, 136(%0)" + + "\n movq 32(%0), %%rax" /* read input column 4 */ + "\n movq 40(%0), %%rcx" /* read input column 5 */ + "\n movq 48(%0), %%rbx" /* read input column 6 */ + "\n movq 56(%0), %%rdx" /* read input column 7 */ + + "\n movq $0xbfffffffffffffff, %%r14" + "\n movq $0xafffffffffffffff, %%r15" + "\n xorq 
%%r14, %%rax" + "\n xorq %%r15, %%rcx" + "\n movq $0x9fffffffffffffff, %%r14" + "\n movq $0x8fffffffffffffff, %%r15" + "\n xorq %%r14, %%rbx" + "\n xorq %%r15, %%rdx" + + "\n movq 128(%0), %%r14" + "\n movq 136(%0), %%r15" + + "\n # processing input words x[5]=rcx and x[7]=rdx " + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n movq grsoT0(,%%rdi,8), %%mm4" + "\n pxor grsoT1(,%%rsi,8), %%mm2" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n pxor grsoT0(,%%rsi,8), %%mm6" + "\n pxor grsoT1(,%%rdi,8), %%mm4" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n pxor grsoT2(,%%rdi,8), %%mm0" + "\n xorq grsoT3(,%%rsi,8), %%r10" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n pxor grsoT2(,%%rsi,8), %%mm2" + "\n xorq grsoT3(,%%rdi,8), %%r12" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n pxor grsoT4(,%%rdi,8), %%mm5" + "\n pxor grsoT5(,%%rsi,8), %%mm3" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n pxor grsoT4(,%%rsi,8), %%mm7" + "\n pxor grsoT5(,%%rdi,8), %%mm5" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq 8(%0), %%rsi" + "\n pxor grsoT6(,%%rdi,8), %%mm1" + "\n xorq grsoT7(,%%rsi,8), %%r15" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq 8(%0), %%rdi" + "\n pxor grsoT6(,%%rsi,8), %%mm3" + "\n pxor grsoT7(,%%rdi,8), %%mm1" + + + "\n # processing input words x[4]=rax and x[6]=rbx " + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT0(,%%rdi,8), %%mm3" + "\n pxor grsoT1(,%%rsi,8), %%mm1" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT0(,%%rsi,8), %%mm5" + "\n pxor grsoT1(,%%rdi,8), %%mm3" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT2(,%%rdi,8), %%r15" + "\n movq grsoT3(,%%rsi,8), %%r9" + "\n shrq $16, %%rax" + "\n movzbl 
%%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT2(,%%rsi,8), %%mm1" + "\n xorq grsoT3(,%%rdi,8), %%r11" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT4(,%%rdi,8), %%mm4" + "\n pxor grsoT5(,%%rsi,8), %%mm2" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT4(,%%rsi,8), %%mm6" + "\n pxor grsoT5(,%%rdi,8), %%mm4" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq 8(%0), %%rsi" + "\n pxor grsoT6(,%%rdi,8), %%mm0" + "\n xorq grsoT7(,%%rsi,8), %%r14" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq 8(%0), %%rdi" + "\n pxor grsoT6(,%%rsi,8), %%mm2" + "\n pxor grsoT7(,%%rdi,8), %%mm0" + + /* read columns 8-11 from registers and add round constants to these */ + "\n movq %%r14, 128(%0)" + "\n movq %%r15, 136(%0)" + + "\n movq 64(%0), %%rax" /* read input column 8 */ + "\n movq 72(%0), %%rcx" /* read input column 9 */ + "\n movq 80(%0), %%rbx" /* read input column 10 */ + "\n movq 88(%0), %%rdx" /* read input column 11 */ + + "\n movq $0x7fffffffffffffff, %%r14" + "\n movq $0x6fffffffffffffff, %%r15" + "\n xorq %%r14, %%rax" + "\n xorq %%r15, %%rcx" + "\n movq $0x5fffffffffffffff, %%r14" + "\n movq $0x4fffffffffffffff, %%r15" + "\n xorq %%r14, %%rbx" + "\n xorq %%r15, %%rdx" + + "\n movq 128(%0), %%r14" + "\n movq 136(%0), %%r15" + + + "\n # processing input words x[9]=rcx and x[11]=rdx " + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq grsoT0(,%%rdi,8), %%r8" + "\n pxor grsoT1(,%%rsi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT0(,%%rsi,8), %%r10" + "\n xorq grsoT1(,%%rdi,8), %%r8" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n pxor grsoT2(,%%rdi,8), %%mm4" + "\n xorq grsoT3(,%%rsi,8), %%r14" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n pxor grsoT2(,%%rsi,8), %%mm6" + "\n pxor 
grsoT3(,%%rdi,8), %%mm0" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq grsoT4(,%%rdi,8), %%r9" + "\n pxor grsoT5(,%%rsi,8), %%mm7" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT4(,%%rsi,8), %%r11" + "\n xorq grsoT5(,%%rdi,8), %%r9" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq 8(%0), %%rsi" + "\n pxor grsoT6(,%%rdi,8), %%mm5" + "\n pxor grsoT7(,%%rsi,8), %%mm3" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq 8(%0), %%rdi" + "\n pxor grsoT6(,%%rsi,8), %%mm7" + "\n pxor grsoT7(,%%rdi,8), %%mm5" + + + + "\n # processing input words x[8]=rax and x[10]=rbx " + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT0(,%%rdi,8), %%mm7" + "\n pxor grsoT1(,%%rsi,8), %%mm5" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT0(,%%rsi,8), %%r9" + "\n pxor grsoT1(,%%rdi,8), %%mm7" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT2(,%%rdi,8), %%mm3" + "\n xorq grsoT3(,%%rsi,8), %%r13" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n pxor grsoT2(,%%rsi,8), %%mm5" + "\n xorq grsoT3(,%%rdi,8), %%r15" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT4(,%%rdi,8), %%r8" + "\n pxor grsoT5(,%%rsi,8), %%mm6" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT4(,%%rsi,8), %%r10" + "\n xorq grsoT5(,%%rdi,8), %%r8" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq 8(%0), %%rsi" + "\n pxor grsoT6(,%%rdi,8), %%mm4" + "\n pxor grsoT7(,%%rsi,8), %%mm2" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq 8(%0), %%rdi" + "\n pxor grsoT6(,%%rsi,8), %%mm6" + "\n pxor grsoT7(,%%rdi,8), %%mm4" + + + /* read columns 12-15 from registers and add round constants to these */ + "\n movq 
%%r14, 128(%0)" + "\n movq %%r15, 136(%0)" + + "\n movq 96(%0), %%rax" /* read input column 12 */ + "\n movq 104(%0), %%rcx" /* read input column 13 */ + "\n movq 112(%0), %%rbx" /* read input column 14 */ + "\n movq 120(%0), %%rdx" /* read input column 15 */ + + "\n movq $0x3fffffffffffffff, %%r14" + "\n movq $0x2fffffffffffffff, %%r15" + "\n xorq %%r14, %%rax" + "\n xorq %%r15, %%rcx" + "\n movq $0x1fffffffffffffff, %%r14" + "\n movq $0x0fffffffffffffff, %%r15" + "\n xorq %%r14, %%rbx" + "\n xorq %%r15, %%rdx" + + "\n movq 128(%0), %%r14" + "\n movq 136(%0), %%r15" + + + "\n # processing input words x[13]=rcx and x[15]=rdx " + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq grsoT0(,%%rdi,8), %%r12" + "\n xorq grsoT1(,%%rsi,8), %%r10" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT0(,%%rsi,8), %%r14" + "\n xorq grsoT1(,%%rdi,8), %%r12" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq grsoT2(,%%rdi,8), %%r8" + "\n pxor grsoT3(,%%rsi,8), %%mm2" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT2(,%%rsi,8), %%r10" + "\n pxor grsoT3(,%%rdi,8), %%mm4" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq grsoT4(,%%rdi,8), %%r13" + "\n xorq grsoT5(,%%rsi,8), %%r11" + "\n shrq $16, %%rcx" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq grsoT4(,%%rsi,8), %%r15" + "\n xorq grsoT5(,%%rdi,8), %%r13" + "\n shrq $16, %%rdx" + + + + "\n movzbl %%cl, %%edi" + "\n movzbl %%ch, %%esi" + "\n xorq 8(%0), %%rsi" + "\n xorq grsoT6(,%%rdi,8), %%r9" + "\n pxor grsoT7(,%%rsi,8), %%mm7" + "\n movzbl %%dl, %%esi" + "\n movzbl %%dh, %%edi" + "\n xorq 8(%0), %%rdi" + "\n xorq grsoT6(,%%rsi,8), %%r11" + "\n xorq grsoT7(,%%rdi,8), %%r9" + + + + "\n # processing input words x[12]=rax and x[14]=rbx " + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT0(,%%rdi,8), %%r11" + "\n xorq 
grsoT1(,%%rsi,8), %%r9" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT0(,%%rsi,8), %%r13" + "\n xorq grsoT1(,%%rdi,8), %%r11" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n pxor grsoT2(,%%rdi,8), %%mm7" + "\n pxor grsoT3(,%%rsi,8), %%mm1" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT2(,%%rsi,8), %%r9" + "\n pxor grsoT3(,%%rdi,8), %%mm3" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq grsoT4(,%%rdi,8), %%r12" + "\n xorq grsoT5(,%%rsi,8), %%r10" + "\n shrq $16, %%rax" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq grsoT4(,%%rsi,8), %%r14" + "\n xorq grsoT5(,%%rdi,8), %%r12" + "\n shrq $16, %%rbx" + + + + "\n movzbl %%al, %%edi" + "\n movzbl %%ah, %%esi" + "\n xorq 8(%0), %%rsi" + "\n xorq grsoT6(,%%rdi,8), %%r8" + "\n pxor grsoT7(,%%rsi,8), %%mm6" + "\n movzbl %%bl, %%esi" + "\n movzbl %%bh, %%edi" + "\n xorq 8(%0), %%rdi" + "\n xorq grsoT6(,%%rsi,8), %%r10" + "\n xorq grsoT7(,%%rdi,8), %%r8" + + "\n incq 8(%0) #increment counter" + + "\n movq 8(%0), %%rdi" + "\n cmp $14, %%edi" + "\n je 2f" + "\n movq %%mm1, %%rcx" + "\n movq %%mm3, %%rdx" + "\n movq %%mm0, 0(%0)" + "\n movq %%mm2, 16(%0)" + "\n movq %%mm4, 32(%0)" + "\n movq %%mm5, 40(%0)" + "\n movq %%mm6, 48(%0)" + "\n movq %%mm7, 56(%0)" + "\n movq %%r8 , 64(%0)" + "\n movq %%r9 , 72(%0)" + "\n movq %%r10, 80(%0)" + "\n movq %%r11, 88(%0)" + "\n movq %%r12, 96(%0)" + "\n movq %%r13, 104(%0)" + "\n movq %%r14, 112(%0)" + "\n movq %%r15, 120(%0)" + "\n jmp 1b" + "\n 2:" + "\n movq %%mm0, 0(%0)" + "\n movq %%mm1, 8(%0)" + "\n movq %%mm2, 16(%0)" + "\n movq %%mm3, 24(%0)" + "\n movq %%mm4, 32(%0)" + "\n movq %%mm5, 40(%0)" + "\n movq %%mm6, 48(%0)" + "\n movq %%mm7, 56(%0)" + "\n movq %%r8 , 64(%0)" + "\n movq %%r9 , 72(%0)" + "\n movq %%r10, 80(%0)" + "\n movq %%r11, 88(%0)" + "\n movq %%r12, 96(%0)" + "\n movq %%r13, 104(%0)" 
+ "\n movq %%r14, 112(%0)" + "\n movq %%r15, 120(%0)" + : /*no output, only memory is modified */ + : "r"(x) + : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%rsi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory" , "%mm0", "%mm1", "%mm2" , "%mm3" , "%mm4" , "%mm5" , "%mm6" , "%mm7" ); +}//Q512ASM() + diff --git a/algo/groestl/sse2/grso-asm.h b/algo/groestl/sse2/grso-asm.h new file mode 100644 index 0000000..5323e2a --- /dev/null +++ b/algo/groestl/sse2/grso-asm.h @@ -0,0 +1,10 @@ +#ifndef GRSOASM_H +#define GRSOASM_H + +#include "grso.h" + +void grsoP1024ASM (u64 *x) ; + +void grsoQ1024ASM (u64 *x) ; + +#endif diff --git a/algo/groestl/sse2/grso-asm2.c b/algo/groestl/sse2/grso-asm2.c new file mode 100644 index 0000000..a86afb0 --- /dev/null +++ b/algo/groestl/sse2/grso-asm2.c @@ -0,0 +1,1016 @@ +/* sse4 optimized asm */ +/* not really any faster as most of the time is spend loading up a huge table of 1024 ints + * need to write small lanes groestl with sse loads and partial operations + * could be faster for once block if doing partial transforms on a single block + * without lanes transforms function could break after 64bytes is finished +*/ + +#include "grso-asm.h" + +void grsoP1024ASM(u64 *x) { +asm ( + "\n ### load input state from memory to 16 low halves of XMM registers xmm0...xmm15" + "\n movaps 0(%0), %%xmm0" + "\n movhlps %%xmm0, %%xmm1" + "\n movaps 16(%0), %%xmm2" + "\n movhlps %%xmm2, %%xmm3" + "\n movaps 32(%0), %%xmm4" + "\n movhlps %%xmm4, %%xmm5" + "\n movaps 48(%0), %%xmm6" + "\n movhlps %%xmm6, %%xmm7" + "\n movaps 64(%0), %%xmm8" + "\n movhlps %%xmm8, %%xmm9" + "\n movaps 80(%0), %%xmm10" + "\n movhlps %%xmm10, %%xmm11" + "\n movaps 96(%0), %%xmm12" + "\n movhlps %%xmm12, %%xmm13" + "\n movaps 112(%0), %%xmm14" + "\n movhlps %%xmm14, %%xmm15" + "\n xorq %%rbx, %%rbx" + "\n 1: # beginning of the loop" + + "\n ### process 1st special pair of input words, words x[2], x[11]" + "\n movq %%xmm2, %%rax" + "\n xorq $0x20, %%rax #xor 
column dependent constant to x[2]" + "\n xorq %%rbx, %%rax #xor round counter" + "\n movq %%xmm11, %%rcx" + "\n shrq $32, %%rcx #no need add constants to x[11] since it's shifted by 32 bits" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n movq grsoT0(,%%rdx,8), %%mm2" + "\n movq grsoT4(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n movq grsoT1(,%%rdx,8), %%mm1" + "\n movq grsoT5(,%%rdi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n movq grsoT2(,%%rdx,8), %%mm0" + "\n movq grsoT6(,%%rdi,8), %%mm5" + "\n shrq $40,%%rax" + "\n movzbl %%al, %%edx" + "\n movzbl %%ch, %%edi" + "\n pxor grsoT7(,%%rdx,8), %%mm7" + "\n pxor grsoT7(,%%rdi,8), %%mm0" + + "\n ### process the third pair of input words, words x[4], x[9]" + "\n movq %%xmm9, %%rcx" + "\n movq %%xmm4, %%rax" + "\n xorq $0x40, %%rax #xor column dependent constant to x[4]" + "\n xorq %%rbx, %%rax #xor round counter" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n movq grsoT0(,%%rdx,8), %%mm4" + "\n pxor grsoT2(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n movq grsoT1(,%%rdx,8), %%mm3" + "\n pxor grsoT3(,%%rdi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT2(,%%rdx,8), %%mm2" + "\n pxor grsoT4(,%%rdi,8), %%mm5" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT3(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm4" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT4(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm3" + + "\n ### process 2nd special pair of input words, words x[1], x[12]" + "\n movq %%xmm12, %%rcx" + "\n movq %%xmm1, %%rax" + "\n xorq $0x10, %%rax #xor column dependent constant to x[1]" + "\n xorq %%rbx, %%rax #xor round counter" + "\n shrq $40, %%rcx" + "\n movzbl %%al, 
%%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n pxor grsoT1(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm6" + "\n shrq $56, %%rax" + "\n shrq $16, %%rcx" + "\n movzbl %%cl, %%edi" + "\n movzbl %%al, %%edx" + "\n pxor grsoT7(,%%rdx,8), %%mm6" + "\n pxor grsoT7(,%%rdi,8), %%mm1" + + "\n ### process the fourth pair of input words, words x[3], x[10]" + "\n movq %%xmm10, %%rcx" + "\n movq %%xmm3, %%rax" + "\n xorq $0x30, %%rax #xor column dependent constant to x[3]" + "\n xorq %%rbx, %%rax #xor round counter" + "\n shrq $24, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm3" + "\n pxor grsoT3(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT1(,%%rdx,8), %%mm2" + "\n pxor grsoT4(,%%rdi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT2(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm5" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT3(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm4" + "\n shrq $16, %%rcx" + + "\n ### process 3rd special pair of input words, words x[0], x[13]" + "\n movq %%xmm13, %%rcx" + "\n movq %%xmm0, %%rax" + "\n xorq %%rbx, %%rax #xor round counter to x[0], column dependent const =0" + "\n shrq $48, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm7" + "\n shrq $48, %%rax" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n pxor grsoT7(,%%rdx,8), %%mm5" + "\n pxor grsoT7(,%%rdi,8), %%mm2" + + "\n ### process the second pair of input words, words x[5], x[8]" + "\n movq %%xmm8, %%rcx" + "\n movq %%xmm5, %%rax" + "\n xorq $0x50, %%rax #xor column dependent constant to x[5]" + "\n xorq %%rbx, %%rax #xor round counter to x[5]" + "\n shrq $8, %%rcx" 
+ "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm5" + "\n pxor grsoT1(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT1(,%%rdx,8), %%mm4" + "\n pxor grsoT2(,%%rdi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT2(,%%rdx,8), %%mm3" + "\n pxor grsoT3(,%%rdi,8), %%mm5" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT3(,%%rdx,8), %%mm2" + "\n pxor grsoT4(,%%rdi,8), %%mm4" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT4(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm3" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT5(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm2" + "\n shrq $16, %%rcx" + + "\n ### process 4th special pair of input words, words x[14], x[15]" + "\n movq %%xmm15, %%rcx" + "\n movq %%xmm14, %%rax" + "\n shrq $56, %%rcx" + "\n shrq $56, %%rax" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT7(,%%rdx,8), %%mm3" + "\n pxor grsoT7(,%%rdi,8), %%mm4" + + "\n ### process the first pair of input words, words x[6], x[7]" + "\n movq %%xmm6, %%rax" + "\n movq %%xmm7, %%rcx" + "\n xorq $0x60, %%rax #xor column dependent constant to x[6]" + "\n xorq $0x70, %%rcx #xor column dependent constant to x[7]" + "\n xorq %%rbx, %%rax #xor round counter to x[6]" + "\n xorq %%rbx, %%rcx #xor round counter to x[7]" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm6" + "\n pxor grsoT0(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT1(,%%rdx,8), %%mm5" + "\n pxor grsoT1(,%%rdi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT2(,%%rdx,8), %%mm4" + "\n pxor grsoT2(,%%rdi,8), %%mm5" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, 
%%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT3(,%%rdx,8), %%mm3" + "\n pxor grsoT3(,%%rdi,8), %%mm4" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT4(,%%rdx,8), %%mm2" + "\n pxor grsoT4(,%%rdi,8), %%mm3" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT5(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm2" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT6(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm1" + + "\n ### writes contents of MM0..MM7 to memory " + "\n movq %%mm7, 56(%0)" + "\n movq %%mm6, 48(%0)" + "\n movq %%mm5, 40(%0)" + "\n movq %%mm4, 32(%0)" + "\n movq %%mm3, 24(%0)" + "\n movq %%mm2, 16(%0)" + "\n movq %%mm1, 8(%0)" + "\n movq %%mm0, 0(%0)" + "\n #use the remaining data in ah, ch to process" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n movq grsoT7(,%%rdx,8), %%mm3" + "\n movq grsoT7(,%%rdi,8), %%mm4" + + "\n ### process the first pair of input words, words x[14], x[15]" + "\n movq %%xmm14, %%rax" + "\n movq %%xmm15, %%rcx" + "\n xorq $0xe0, %%rax #xor column dependent constant to x[14]" + "\n xorq $0xf0, %%rcx #xor column dependent constant to x[15]" + "\n xorq %%rbx, %%rax #xor round counter to x[14]" + "\n xorq %%rbx, %%rcx #xor round counter to x[15]" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n movq grsoT0(,%%rdx,8), %%mm6" + "\n movq grsoT0(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n movq grsoT1(,%%rdx,8), %%mm5" + "\n pxor grsoT1(,%%rdi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT2(,%%rdx,8), %%mm4" + "\n pxor grsoT2(,%%rdi,8), %%mm5" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT3(,%%rdx,8), %%mm3" + "\n pxor grsoT3(,%%rdi,8), %%mm4" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n movq 
grsoT4(,%%rdx,8), %%mm2" + "\n pxor grsoT4(,%%rdi,8), %%mm3" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n movq grsoT5(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm2" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n movq grsoT6(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm1" + + "\n ### process 3rd special pair of input words, words x[8], x[5]" + "\n movq %%xmm5, %%rcx" + "\n movq %%xmm8, %%rax" + "\n xorq $0x80, %%rax #xor column dependent constant to x[8]" + "\n xorq %%rbx, %%rax #xor round counter" + "\n shrq $48, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm7" + "\n shrq $48, %%rax" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n pxor grsoT7(,%%rdx,8), %%mm5" + "\n pxor grsoT7(,%%rdi,8), %%mm2" + + "\n ### process the second pair of input words, words x[13], x[0]" + "\n movq %%xmm0, %%rcx" + "\n movq %%xmm13, %%rax" + "\n xorq $0xd0, %%rax #xor column dependent constant to x[13]" + "\n xorq %%rbx, %%rax #xor round counter" + "\n shrq $8, %%rcx #no column constant and after shift no round counter either" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm5" + "\n pxor grsoT1(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT1(,%%rdx,8), %%mm4" + "\n pxor grsoT2(,%%rdi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT2(,%%rdx,8), %%mm3" + "\n pxor grsoT3(,%%rdi,8), %%mm5" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT3(,%%rdx,8), %%mm2" + "\n pxor grsoT4(,%%rdi,8), %%mm4" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT4(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm3" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor 
grsoT5(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm2" + "\n shrq $16, %%rcx" + + "\n ### process the third pair of input words, words x[12], x[1]" + "\n movq %%xmm1, %%rcx" + "\n movq %%xmm12, %%rax" + "\n xorq $0xc0, %%rax #xor column dependent constant to x[12]" + "\n xorq %%rbx, %%rax #xor round counter to x[12]" + "\n shrq $16, %%rcx #constant disappears after shift" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm4" + "\n pxor grsoT2(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT1(,%%rdx,8), %%mm3" + "\n pxor grsoT3(,%%rdi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT2(,%%rdx,8), %%mm2" + "\n pxor grsoT4(,%%rdi,8), %%mm5" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT3(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm4" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT4(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm3" + + "\n ### process 2nd special pair of input words, words x[9], x[4]" + "\n movq %%xmm4, %%rcx" + "\n movq %%xmm9, %%rax" + "\n xorq $0x90, %%rax #xor round dependent constant to x[9]" + "\n xorq %%rbx, %%rax #xor round counter to x[9]" + "\n shrq $40, %%rcx #constant disappears after shift in x[4]" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n pxor grsoT1(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm6" + "\n shrq $56, %%rax" + "\n shrq $16, %%rcx" + "\n movzbl %%cl, %%edi" + "\n movzbl %%al, %%edx" + "\n pxor grsoT7(,%%rdx,8), %%mm6" + "\n pxor grsoT7(,%%rdi,8), %%mm1" + + "\n ### process the fourth pair of input words, words x[11], x[2]" + "\n movq %%xmm2, %%rcx" + "\n movq %%xmm11, %%rax" + "\n xorq $0xb0, %%rax #xor column dependent constant to 
x[11]" + "\n xorq %%rbx, %%rax #xor round counter to x[11]" + "\n shrq $24, %%rcx #constants disappear after shift in x[2]" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm3" + "\n pxor grsoT3(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT1(,%%rdx,8), %%mm2" + "\n pxor grsoT4(,%%rdi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT2(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm5" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT3(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm4" + "\n shrq $16, %%rcx" + + "\n ### process 1st special pair of input words, words x[10], x[3]" + "\n movq %%xmm10, %%rax" + "\n movq %%xmm3, %%rcx" + "\n xorq $0xa0, %%rax #xor column dependent constant" + "\n xorq %%rbx, %%rax #xor round counter" + "\n shrq $32, %%rcx #constants disappear after shift" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT0(,%%rdx,8), %%mm2" + "\n pxor grsoT4(,%%rdi,8), %%mm7" + "\n movzbl %%ah, %%edx" + "\n movzbl %%ch, %%edi" + "\n shrq $16, %%rax" + "\n pxor grsoT1(,%%rdx,8), %%mm1" + "\n pxor grsoT5(,%%rdi,8), %%mm6" + "\n shrq $16, %%rcx" + "\n movzbl %%al, %%edx" + "\n movzbl %%cl, %%edi" + "\n pxor grsoT2(,%%rdx,8), %%mm0" + "\n pxor grsoT6(,%%rdi,8), %%mm5" + "\n shrq $40,%%rax" + "\n movzbl %%al, %%edx" + "\n movzbl %%ch, %%edi" + "\n pxor grsoT7(,%%rdx,8), %%mm7" + "\n pxor grsoT7(,%%rdi,8), %%mm0" + + "\n incq %%rbx" + "\n cmp $14, %%rbx" + "\n je 2f" + + + "\n ### move 8 MMX registers to low halves of XMM registers" + "\n movq2dq %%mm0, %%xmm8" + "\n movq2dq %%mm1, %%xmm9" + "\n movq2dq %%mm2, %%xmm10" + "\n movq2dq %%mm3, %%xmm11" + "\n movq2dq %%mm4, %%xmm12" + "\n movq2dq %%mm5, %%xmm13" + "\n movq2dq %%mm6, %%xmm14" + "\n movq2dq %%mm7, %%xmm15" + + "\n ### read back 8 words of input state from memory to 8 low halves of XMM 
registers xmm0...xmm15" + "\n movaps 0(%0), %%xmm0" + "\n movhlps %%xmm0, %%xmm1" + "\n movaps 16(%0), %%xmm2" + "\n movhlps %%xmm2, %%xmm3" + "\n movaps 32(%0), %%xmm4" + "\n movhlps %%xmm4, %%xmm5" + "\n movaps 48(%0), %%xmm6" + "\n movhlps %%xmm6, %%xmm7" + "\n jmp 1b" + + "\n 2: # finalization" + + "\n ### writes contents of MM0..MM7 to memory " + "\n movq %%mm7, 120(%0)" + "\n movq %%mm6, 112(%0)" + "\n movq %%mm5, 104(%0)" + "\n movq %%mm4, 96(%0)" + "\n movq %%mm3, 88(%0)" + "\n movq %%mm2, 80(%0)" + "\n movq %%mm1, 72(%0)" + "\n movq %%mm0, 64(%0)" +: /*no output, only memory is modifed */ +: "r"(x) +: "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "memory", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" , "%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , "%xmm8" , "%xmm9" , "%xmm10" , "%xmm11" , "%xmm12" , "%xmm13" , "%xmm14" , "%xmm15" ); +}//P1024ASM() + +void grsoQ1024ASM(u64 *x) { +asm ( + + "\n ### load input state from memory to 16 low halves of XMM registers xmm0...xmm15" + "\n movaps 0(%0), %%xmm0" + "\n movhlps %%xmm0, %%xmm1" + "\n movaps 16(%0), %%xmm2" + "\n movhlps %%xmm2, %%xmm3" + "\n movaps 32(%0), %%xmm4" + "\n movhlps %%xmm4, %%xmm5" + "\n movaps 48(%0), %%xmm6" + "\n movhlps %%xmm6, %%xmm7" + "\n movaps 64(%0), %%xmm8" + "\n movhlps %%xmm8, %%xmm9" + "\n movaps 80(%0), %%xmm10" + "\n movhlps %%xmm10, %%xmm11" + "\n movaps 96(%0), %%xmm12" + "\n movhlps %%xmm12, %%xmm13" + "\n movaps 112(%0), %%xmm14" + "\n movhlps %%xmm14, %%xmm15" + "\n xorl %%ebx, %%ebx" + "\n 1: # beginning of the loop" + + "\n ### load a pair of input words x[7], x[8] to process them" + "\n movq %%xmm7, %%rax #rax = [ x[7].0, x[7].1, x[7].2, x[7].3, x[7].4, x[7].5, x[7].6, x[7].7 ]" + "\n movq %%xmm8, %%rcx #rcx = [ x[8].0, x[8].1, x[8].2, x[8].3, x[8].4, x[8].5, x[8].6, x[8].7 ]" + "\n # xor column constants by xoring 0xfff...ff first and later xoring 0xi0 ^ r to bytes that need that" + "\n notq %%rax" + "\n notq %%rcx" + "\n # 
now we have free register xmm7 which we can use to XOR 0xfff..ff to the remaining ones" + "\n pcmpeqw %%xmm7, %%xmm7 #create mask of all ones in xmm7" + "\n pxor %%xmm7, %%xmm0" + "\n pxor %%xmm7, %%xmm1" + "\n pxor %%xmm7, %%xmm2" + "\n pxor %%xmm7, %%xmm3" + "\n pxor %%xmm7, %%xmm4" + "\n pxor %%xmm7, %%xmm5" + "\n pxor %%xmm7, %%xmm6" + "\n pxor %%xmm7, %%xmm8" + "\n pxor %%xmm7, %%xmm9" + "\n pxor %%xmm7, %%xmm10" + "\n pxor %%xmm7, %%xmm11" + "\n pxor %%xmm7, %%xmm12" + "\n pxor %%xmm7, %%xmm13" + "\n pxor %%xmm7, %%xmm14" + "\n pxor %%xmm7, %%xmm15" + "\n movq %%rax, %%xmm7 #restore orignal value of xmm7 for later" + "\n movzbl %%al, %%edx #edx = x[7].0" + "\n movzbl %%cl, %%edi #edi = x[8].0" + "\n movq grsoT0(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]" + "\n movq grsoT0(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]" + "\n movzbl %%ah, %%edx #edx = x[7].1" + "\n movzbl %%ch, %%edi #edi = x[8].1" + "\n movq grsoT1(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]" + "\n movq grsoT1(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]" + "\n shrq $16, %%rax #rax = [ x[7].2, x[7].3, x[7].4, x[7].5, x[7].6, x[7].7, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[8].2, x[8].3, x[8].4, x[8].5, x[8].6, x[8].7, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[7].2" + "\n movzbl %%cl, %%edi #edi = x[8].2" + "\n movq grsoT2(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]" + "\n movq grsoT2(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]" + "\n shrq $16, %%rax #rax = [ x[7].4, x[7].5, x[7].6, x[7].7, 0, 0, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[8].4, x[8].5, x[8].6, x[8].7, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[7].4" + "\n pxor grsoT4(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]" + "\n movzbl %%ah, %%edx #edx = x[7].5" + "\n movzbl %%ch, %%edi #edi = x[8].5" + "\n pxor grsoT5(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]" + "\n pxor grsoT5(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]" + "\n shrq $16, %%rax #rax = [ x[7].6, x[7].7, 0, 0, 0, 0, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[8].6, x[8].7, 0, 0, 0, 
0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[7].6" + "\n movzbl %%cl, %%edi #edi = x[8].6" + "\n pxor grsoT6(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]" + "\n pxor grsoT6(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]" + "\n movzbl %%ah, %%edx #edx = x[7].7" + "\n movzbl %%ch, %%edi #edi = x[8].7" + "\n xorl $0x70, %%edx #xor column dependent part of const" + "\n xorl $0x80, %%edi #xor column dependent part of const" + "\n xorl %%ebx, %%edx #xor round counter" + "\n xorl %%ebx, %%edi #xor round counter" + "\n movq grsoT7(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]" + "\n pxor grsoT7(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]" + + "\n ### load a pair of input words x[13], x[14] and process them" + "\n movq %%xmm13, %%rax #rax = [ x[13].0, x[13].1, x[13].2, x[13].3, x[13].4, x[13].5, x[13].6, x[13].7 ]" + "\n movq %%xmm14, %%rcx #rcx = [ x[14].0, x[14].1, x[14].2, x[14].3, x[14].4, x[14].5, x[14].6, x[14].7 ]" + "\n shrq $24, %%rax #rax = [ x[13].3, x[13].4, x[13].5, x[13].6, x[13].7, 0, 0, 0 ]" + "\n shrq $24, %%rcx #rcx = [ x[14].3, x[14].4, x[14].5, x[14].6, x[14].7, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[13].3" + "\n movzbl %%cl, %%edi #edi = x[14].3" + "\n pxor grsoT3(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]^grsoT5[x[4].5]^grsoT4[x[2].4]^grsoT3[x[13].3]" + "\n pxor grsoT3(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]^grsoT4[x[3].4]^grsoT7[x[9].7]^grsoT3[x[14].3]" + "\n shrq $32, %%rax #rax = [ x[13].7, 0, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[13].7" + "\n xorl $0xd0, %%edx #xor column constant" + "\n xorl %%ebx, %%edx #xor round counter" + "\n pxor grsoT7(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]^grsoT2[x[12].2]^grsoT6[x[11].6]^grsoT7[x[13].7]" + + "\n ### load a pair of input words x[5], x[6] and process them" + "\n movq %%xmm5, %%rax #rax = [ x[5].0, 
x[5].1, x[5].2, x[5].3, x[5].4, x[5].5, x[5].6, x[5].7 ]" + "\n movq %%xmm6, %%rcx #rcx = [ x[6].0, x[6].1, x[6].2, x[6].3, x[6].4, x[6].5, x[6].6, x[6].7 ]" + "\n movzbl %%al, %%edx #edx = x[5].0" + "\n movzbl %%cl, %%edi #edi = x[6].0" + "\n pxor grsoT0(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]" + "\n pxor grsoT0(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]" + "\n movzbl %%ah, %%edx #edx = x[5].1" + "\n movzbl %%ch, %%edi #edi = x[6].1" + "\n pxor grsoT1(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]" + "\n pxor grsoT1(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]" + "\n shrq $16, %%rax #rax = [ x[5].2, x[5].3, x[5].4, x[5].5, x[5].6, x[5].7, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[6].2, x[6].3, x[6].4, x[6].5, x[6].6, x[6].7, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[5].2" + "\n movzbl %%cl, %%edi #edi = x[6].2" + "\n movq grsoT2(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]" + "\n pxor grsoT2(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]" + "\n shrq $16, %%rax #rax = [ x[5].4, x[5].5, x[5].6, x[5].7, 0, 0, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[6].4, x[6].5, x[6].6, x[6].7, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[5].4" + "\n movzbl %%cl, %%edi #edi = x[6].4" + "\n pxor grsoT4(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]" + "\n pxor grsoT4(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]" + "\n movzbl %%ah, %%edx #edx = x[5].5" + "\n movzbl %%ch, %%edi #edi = x[6].5" + "\n pxor grsoT5(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]" + "\n pxor grsoT5(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]" + "\n shrq $16, %%rax #rax = [ x[5].6, x[5].7, 0, 0, 0, 0, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[6].6, x[6].7, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[5].6" + "\n movzbl %%cl, %%edi #edi = x[6].6" + "\n pxor grsoT6(,%%rdx,8), 
%%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]" + "\n pxor grsoT6(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]" + "\n movzbl %%ch, %%edi #edi = x[6].7" + "\n xorl $0x60, %%edi #xor column dependent part of const" + "\n xorl %%ebx, %%edi #xor round conter" + "\n pxor grsoT7(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]" + + "\n ### load a pair of input words x[15], x[0] and process them" + "\n movq %%xmm15, %%rax #rax = [ x[15].0, x[15].1, x[15].2, x[15].3, x[15].4, x[15].5, x[15].6, x[15].7 ]" + "\n movq %%xmm0, %%rcx #rcx = [ x[0].0, x[0].1, x[0].2, x[0].3, x[0].4, x[0].5, x[0].6, x[0].7 ]" + "\n shrq $24, %%rax #rax = [ x[15].3, x[15].4, x[15].5, x[15].6, x[15].7, 0, 0, 0 ]" + "\n shrq $24, %%rcx #rcx = [ x[0].3, x[0].4, x[0].5, x[0].6, x[0].7, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[15].3" + "\n movzbl %%cl, %%edi #edi = x[0].3" + "\n pxor grsoT3(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]^grsoT2[x[9].2]^grsoT7[x[10].7]^grsoT3[x[15].3]" + "\n pxor grsoT3(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]^grsoT6[x[9].6]^grsoT7[x[11].7]^grsoT3[x[0].3]" + "\n movzbl %%ch, %%edi #edi = x[0].4" + "\n pxor grsoT4(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]^grsoT5[x[2].5]^grsoT3[x[11].3]^grsoT4[x[0].4]" + + "\n ### load a pair of input words x[3], x[4] and process them" + "\n movq %%xmm3, %%rax #rax = [ x[3].0, x[3].1, x[3].2, x[3].3, x[3].4, x[3].5, x[3].6, x[3].7 ]" + "\n movq %%xmm4, %%rcx #rcx = [ x[4].0, x[4].1, x[4].2, x[4].3, x[4].4, x[4].5, x[4].6, x[4].7 ]" + "\n movzbl %%al, %%edx #edx = x[3].0" + "\n movzbl %%cl, %%edi #edi = x[4].0" + "\n pxor grsoT0(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]" + "\n pxor grsoT0(,%%rdi,8), %%mm3 
#y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]" + "\n movzbl %%ah, %%edx #edx = x[3].1" + "\n movzbl %%ch, %%edi #edi = x[4].1" + "\n pxor grsoT1(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]" + "\n pxor grsoT1(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]" + "\n shrq $32, %%rax #rax = [ x[3].4, x[3].5, x[3].6, x[3].7, 0, 0, 0, 0 ]" + "\n shrq $32, %%rcx #rcx = [ x[4].4, x[4].5, x[4].6, x[4].7, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[3].4" + "\n movzbl %%cl, %%edi #edi = x[4].4" + "\n pxor grsoT4(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]^grsoT4[x[3].4]" + "\n pxor grsoT4(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]" + "\n movzbl %%ah, %%edx #edx = x[3].5" + "\n movzbl %%ch, %%edi #edi = x[4].5" + "\n pxor grsoT5(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]" + "\n pxor grsoT5(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]^grsoT5[x[4].5]" + "\n shrq $16, %%rcx #rcx = [ x[4].6, x[4].7, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%cl, %%edi #edi = x[4].6" + "\n pxor grsoT6(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]" + + "\n ### load a pair of input words x[1], x[2] and process them" + "\n movq %%xmm1, %%rax #rax = [ x[1].0, x[1].1, x[1].2, x[1].3, x[1].4, x[1].5, x[1].6, x[1].7 ]" + "\n movq %%xmm2, %%rcx #rcx = [ x[2].0, x[2].1, x[2].2, x[2].3, x[2].4, x[2].5, x[2].6, x[2].7 ]" + "\n movzbl %%al, %%edx #edx = x[1].0" + "\n movzbl %%cl, %%edi #edi = x[2].0" + "\n pxor grsoT0(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]" + "\n pxor grsoT0(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]^grsoT0[x[2].0]" + "\n shrq $24, %%rax #rax = [ 
x[1].3, x[1].4, x[1].5, x[1].6, x[1].7, 0, 0, 0 ]" + "\n shrq $24, %%rcx #rcx = [ x[2].3, x[2].4, x[2].5, x[2].6, x[2].7, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[1].3" + "\n movzbl %%cl, %%edi #edi = x[2].3" + "\n pxor grsoT3(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]" + "\n pxor grsoT3(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]" + "\n movzbl %%ah, %%edx #edx = x[1].4" + "\n movzbl %%ch, %%edi #edi = x[2].4" + "\n pxor grsoT4(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]^grsoT0[x[2].0]^grsoT4[x[1].4]" + "\n pxor grsoT4(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]^grsoT5[x[4].5]^grsoT4[x[2].4]" + "\n shrq $16, %%rcx #rcx = [ x[2].5, x[2].6, x[2].7, 0, 0, 0, 0, 0 ]" + "\n movzbl %%cl, %%edi #edi = x[2].5" + "\n pxor grsoT5(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]^grsoT5[x[2].5]" + + "\n ### load a pair of input words x[9], x[10] and process them" + "\n movq %%xmm9, %%rax #rax = [ x[9].0, x[9].1, x[9].2, x[9].3, x[9].4, x[9].5, x[9].6, x[9].7 ]" + "\n movq %%xmm10, %%rcx #rcx = [ x[10].0, x[10].1, x[10].2, x[10].3, x[10].4, x[10].5, x[10].6, x[10].7 ]" + "\n movzbl %%ah, %%edx #edx = x[9].1" + "\n movzbl %%ch, %%edi #edi = x[10].1" + "\n pxor grsoT1(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]" + "\n pxor grsoT1(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]" + "\n shrq $16, %%rax #rax = [ x[9].2, x[9].3, x[9].4, x[9].5, x[9].6, x[9].7, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[10].2, x[10].3, x[10].4, x[10].5, x[10].6, x[10].7, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[9].2" + "\n movzbl %%cl, %%edi #edi = x[10].2" + "\n pxor grsoT2(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]^grsoT2[x[9].2]" + "\n 
pxor grsoT2(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]" + "\n shrq $24, %%rax #rax = [ x[9].5, x[9].6, x[9].7, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[9].5" + "\n pxor grsoT5(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]" + "\n shrq $8, %%rax #rax = [ x[9].6, x[9].7, 0, 0, 0, 0, 0, 0 ]" + "\n shrq $32, %%rcx #rcx = [ x[10].6, x[10].7, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[9].6" + "\n movzbl %%cl, %%edi #edi = x[10].6" + "\n pxor grsoT6(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]^grsoT6[x[9].6]" + "\n pxor grsoT6(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]^grsoT6[x[10].6]" + "\n movzbl %%ah, %%edx #edx = x[9].7" + "\n movzbl %%ch, %%edi #edi = x[10].7" + "\n xorl $0x90, %%edx #xor column constant" + "\n xorl $0xa0, %%edi #xor column constant" + "\n xorl %%ebx, %%edx #xor round counter" + "\n xorl %%ebx, %%edi #xor round counter" + "\n pxor grsoT7(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]^grsoT4[x[3].4]^grsoT7[x[9].7]" + "\n pxor grsoT7(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]^grsoT2[x[9].2]^grsoT7[x[10].7]" + + "\n ### load a pair of input words x[11], x[12] and process them" + "\n movq %%xmm11, %%rax #rax = [ x[11].0, x[11].1, x[11].2, x[11].3, x[11].4, x[11].5, x[11].6, x[11].7 ]" + "\n movq %%xmm12, %%rcx #rcx = [ x[12].0, x[12].1, x[12].2, x[12].3, x[12].4, x[12].5, x[12].6, x[12].7 ]" + "\n shrq $16, %%rax #rax = [ x[11].2, x[11].3, x[11].4, x[11].5, x[11].6, x[11].7, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[12].2, x[12].3, x[12].4, x[12].5, x[12].6, x[12].7, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[11].2" + "\n movzbl %%cl, %%edi #edi = x[12].2" + "\n pxor grsoT2(,%%rdx,8), %%mm6 
#y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]^grsoT6[x[10].6]^grsoT2[x[11].2]" + "\n pxor grsoT2(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]^grsoT2[x[12].2]" + "\n movzbl %%ah, %%edx #edx = x[11].3" + "\n movzbl %%ch, %%edi #edi = x[12].3" + "\n pxor grsoT3(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]^grsoT5[x[2].5]^grsoT3[x[11].3]" + "\n pxor grsoT3(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]^grsoT0[x[2].0]^grsoT4[x[1].4]^grsoT3[x[12].3]" + "\n shrq $32, %%rax #rax = [ x[11].6, x[11].7, 0, 0, 0, 0, 0, 0 ]" + "\n shrq $32, %%rcx #rcx = [ x[12].6, x[12].7, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[11].6" + "\n pxor grsoT6(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]^grsoT2[x[12].2]^grsoT6[x[11].6]" + "\n movzbl %%ah, %%edx #edx = x[11].7" + "\n movzbl %%ch, %%edi #edi = x[12].7" + "\n xorl $0xb0, %%edx #xor column constant" + "\n xorl $0xc0, %%edi #xor column constant" + "\n xorl %%ebx, %%edx #xor round counter" + "\n xorl %%ebx, %%edi #xor round counter" + "\n pxor grsoT7(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]^grsoT6[x[9].6]^grsoT7[x[11].7]" + "\n pxor grsoT7(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]^grsoT6[x[10].6]^grsoT2[x[11].2]^grsoT7[x[12].7]" + + + "\n ### writes contents of MM0..MM7 to memory " + "\n movq %%mm0, 0(%0)" + "\n movq %%mm1, 8(%0)" + "\n movq %%mm2, 16(%0)" + "\n movq %%mm3, 24(%0)" + "\n movq %%mm4, 32(%0)" + "\n movq %%mm5, 40(%0)" + "\n movq %%mm6, 48(%0)" + "\n movq %%mm7, 56(%0)" + + "\n ### load a pair of input words x[15], x[0] and process them" + "\n movq %%xmm15, %%rax #rax = [ x[15].0, x[15].1, x[15].2, x[15].3, x[15].4, x[15].5, x[15].6, x[15].7 ]" + "\n movq 
%%xmm0, %%rcx #rcx = [ x[0].0, x[0].1, x[0].2, x[0].3, x[0].4, x[0].5, x[0].6, x[0].7 ]" + "\n movzbl %%al, %%edx #edx = x[15].0" + "\n movzbl %%cl, %%edi #edi = x[0].0" + "\n movq grsoT0(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]" + "\n movq grsoT0(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]" + "\n movzbl %%ah, %%edx #edx = x[15].1" + "\n movzbl %%ch, %%edi #edi = x[0].1" + "\n movq grsoT1(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]" + "\n movq grsoT1(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]" + "\n shrq $16, %%rax #rax = [ x[15].2, x[15].3, x[15].4, x[15].5, x[15].6, x[15].7, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[0].2, x[0].3, x[0].4, x[0].5, x[0].6, x[0].7, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[15].2" + "\n movzbl %%cl, %%edi #edi = x[0].2" + "\n movq grsoT2(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]" + "\n movq grsoT2(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]" + "\n shrq $16, %%rax #rax = [ x[15].4, x[15].5, x[15].6, x[15].7, 0, 0, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[0].4, x[0].5, x[0].6, x[0].7, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[15].4" + "\n pxor grsoT4(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]" + "\n movzbl %%ah, %%edx #edx = x[15].5" + "\n movzbl %%ch, %%edi #edi = x[0].5" + "\n pxor grsoT5(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]" + "\n pxor grsoT5(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]" + "\n shrq $16, %%rax #rax = [ x[15].6, x[15].7, 0, 0, 0, 0, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[0].6, x[0].7, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[15].6" + "\n movzbl %%cl, %%edi #edi = x[0].6" + "\n pxor grsoT6(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]" + "\n pxor grsoT6(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]" + "\n movzbl %%ah, %%edx #edx = x[15].7" + "\n movzbl %%ch, %%edi #edi = x[0].7" + "\n xorl $0xf0, %%edx #xor column dependent part of const" + "\n xorl $0x00, %%edi #xor column dependent part of const" + "\n xorl %%ebx, %%edx #xor round counter" + "\n xorl %%ebx, %%edi #xor 
round counter" + "\n movq grsoT7(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]" + "\n pxor grsoT7(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]" + + "\n ### load a pair of input words x[5], x[6] and process them" + "\n movq %%xmm5, %%rax #rax = [ x[5].0, x[5].1, x[5].2, x[5].3, x[5].4, x[5].5, x[5].6, x[5].7 ]" + "\n movq %%xmm6, %%rcx #rcx = [ x[6].0, x[6].1, x[6].2, x[6].3, x[6].4, x[6].5, x[6].6, x[6].7 ]" + "\n shrq $24, %%rax #rax = [ x[5].3, x[5].4, x[5].5, x[5].6, x[5].7, 0, 0, 0 ]" + "\n shrq $24, %%rcx #rcx = [ x[6].3, x[6].4, x[6].5, x[6].6, x[6].7, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[5].3" + "\n movzbl %%cl, %%edi #edi = x[6].3" + "\n pxor grsoT3(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]^grsoT5[x[12].5]^grsoT4[x[10].4]^grsoT3[x[5].3]" + "\n pxor grsoT3(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]^grsoT4[x[11].4]^grsoT7[x[1].7]^grsoT3[x[6].3]" + "\n shrq $32, %%rax #rax = [ x[5].7, 0, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[5].7" + "\n xorl $0x50, %%edx #xor column constant" + "\n xorl %%ebx, %%edx #xor round counter" + "\n pxor grsoT7(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]^grsoT2[x[4].2]^grsoT6[x[3].6]^grsoT7[x[5].7]" + + + "\n ### load a pair of input words x[13], x[14] and process them" + "\n movq %%xmm13, %%rax #rax = [ x[13].0, x[13].1, x[13].2, x[13].3, x[13].4, x[13].5, x[13].6, x[13].7 ]" + "\n movq %%xmm14, %%rcx #rcx = [ x[14].0, x[14].1, x[14].2, x[14].3, x[14].4, x[14].5, x[14].6, x[14].7 ]" + "\n movzbl %%al, %%edx #edx = x[13].0" + "\n movzbl %%cl, %%edi #edi = x[14].0" + "\n pxor grsoT0(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]" + "\n pxor grsoT0(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]" + "\n movzbl %%ah, %%edx #edx = x[13].1" + "\n movzbl %%ch, %%edi #edi = x[14].1" + "\n pxor 
grsoT1(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]" + "\n pxor grsoT1(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]" + "\n shrq $16, %%rax #rax = [ x[13].2, x[13].3, x[13].4, x[13].5, x[13].6, x[13].7, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[14].2, x[14].3, x[14].4, x[14].5, x[14].6, x[14].7, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[13].2" + "\n movzbl %%cl, %%edi #edi = x[14].2" + "\n movq grsoT2(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]" + "\n pxor grsoT2(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]" + "\n shrq $16, %%rax #rax = [ x[13].4, x[13].5, x[13].6, x[13].7, 0, 0, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[14].4, x[14].5, x[14].6, x[14].7, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[13].4" + "\n movzbl %%cl, %%edi #edi = x[14].4" + "\n pxor grsoT4(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]" + "\n pxor grsoT4(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]" + "\n movzbl %%ah, %%edx #edx = x[13].5" + "\n movzbl %%ch, %%edi #edi = x[14].5" + "\n pxor grsoT5(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]" + "\n pxor grsoT5(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]" + "\n shrq $16, %%rax #rax = [ x[13].6, x[13].7, 0, 0, 0, 0, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[14].6, x[14].7, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[13].6" + "\n movzbl %%cl, %%edi #edi = x[14].6" + "\n pxor grsoT6(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]" + "\n pxor grsoT6(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]" + "\n movzbl %%ch, %%edi #edi = x[14].7" + "\n xorl $0xe0, %%edi #xor column dependent part of const" + "\n xorl %%ebx, %%edi #xor round conter" + "\n pxor grsoT7(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]" + + "\n ### load a pair of input words x[7], x[8] and process them" 
+ "\n movq %%xmm7, %%rax #rax = [ x[7].0, x[7].1, x[7].2, x[7].3, x[7].4, x[7].5, x[7].6, x[7].7 ]" + "\n movq %%xmm8, %%rcx #rcx = [ x[8].0, x[8].1, x[8].2, x[8].3, x[8].4, x[8].5, x[8].6, x[8].7 ]" + "\n shrq $24, %%rax #rax = [ x[7].3, x[7].4, x[7].5, x[7].6, x[7].7, 0, 0, 0 ]" + "\n shrq $24, %%rcx #rcx = [ x[8].3, x[8].4, x[8].5, x[8].6, x[8].7, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[7].3" + "\n movzbl %%cl, %%edi #edi = x[8].3" + "\n pxor grsoT3(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]^grsoT2[x[1].2]^grsoT7[x[2].7]^grsoT3[x[7].3]" + "\n pxor grsoT3(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]^grsoT6[x[1].6]^grsoT7[x[3].7]^grsoT3[x[8].3]" + "\n movzbl %%ch, %%edi #edi = x[8].4" + "\n pxor grsoT4(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]^grsoT5[x[10].5]^grsoT3[x[3].3]^grsoT4[x[8].4]" + + + "\n ### load a pair of input words x[11], x[12] and process them" + "\n movq %%xmm11, %%rax #rax = [ x[11].0, x[11].1, x[11].2, x[11].3, x[11].4, x[11].5, x[11].6, x[11].7 ]" + "\n movq %%xmm12, %%rcx #rcx = [ x[12].0, x[12].1, x[12].2, x[12].3, x[12].4, x[12].5, x[12].6, x[12].7 ]" + "\n movzbl %%al, %%edx #edx = x[11].0" + "\n movzbl %%cl, %%edi #edi = x[12].0" + "\n pxor grsoT0(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]" + "\n pxor grsoT0(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]" + "\n movzbl %%ah, %%edx #edx = x[11].1" + "\n movzbl %%ch, %%edi #edi = x[12].1" + "\n pxor grsoT1(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]" + "\n pxor grsoT1(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]" + "\n shrq $32, %%rax #rax = [ x[11].4, x[11].5, x[11].6, x[11].7, 0, 0, 0, 0 ]" + "\n shrq $32, %%rcx #rcx = [ x[12].4, 
x[12].5, x[12].6, x[12].7, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[11].4" + "\n movzbl %%cl, %%edi #edi = x[12].4" + "\n pxor grsoT4(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]^grsoT4[x[11].4]" + "\n pxor grsoT4(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]" + "\n movzbl %%ah, %%edx #edx = x[11].5" + "\n movzbl %%ch, %%edi #edi = x[12].5" + "\n pxor grsoT5(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]" + "\n pxor grsoT5(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]^grsoT5[x[12].5]" + "\n shrq $16, %%rcx #rcx = [ x[12].6, x[12].7, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%cl, %%edi #edi = x[12].6" + "\n pxor grsoT6(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]" + + "\n ### load a pair of input words x[9], x[10] and process them" + "\n movq %%xmm9, %%rax #rax = [ x[9].0, x[9].1, x[9].2, x[9].3, x[9].4, x[9].5, x[9].6, x[9].7 ]" + "\n movq %%xmm10, %%rcx #rcx = [ x[10].0, x[10].1, x[10].2, x[10].3, x[10].4, x[10].5, x[10].6, x[10].7 ]" + "\n movzbl %%al, %%edx #edx = x[9].0" + "\n movzbl %%cl, %%edi #edi = x[10].0" + "\n pxor grsoT0(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]" + "\n pxor grsoT0(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]^grsoT0[x[10].0]" + "\n shrq $24, %%rax #rax = [ x[9].3, x[9].4, x[9].5, x[9].6, x[9].7, 0, 0, 0 ]" + "\n shrq $24, %%rcx #rcx = [ x[10].3, x[10].4, x[10].5, x[10].6, x[10].7, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[9].3" + "\n movzbl %%cl, %%edi #edi = x[10].3" + "\n pxor grsoT3(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]" + "\n pxor grsoT3(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]" 
+ "\n movzbl %%ah, %%edx #edx = x[9].4" + "\n movzbl %%ch, %%edi #edi = x[10].4" + "\n pxor grsoT4(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]^grsoT0[x[10].0]^grsoT4[x[9].4]" + "\n pxor grsoT4(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]^grsoT5[x[12].5]^grsoT4[x[10].4]" + "\n shrq $16, %%rcx #rcx = [ x[10].5, x[10].6, x[10].7, 0, 0, 0, 0, 0 ]" + "\n movzbl %%cl, %%edi #edi = x[10].5" + "\n pxor grsoT5(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]^grsoT5[x[10].5]" + + "\n ### load a pair of input words x[1], x[2] and process them" + "\n movq %%xmm1, %%rax #rax = [ x[1].0, x[1].1, x[1].2, x[1].3, x[1].4, x[1].5, x[1].6, x[1].7 ]" + "\n movq %%xmm2, %%rcx #rcx = [ x[2].0, x[2].1, x[2].2, x[2].3, x[2].4, x[2].5, x[2].6, x[2].7 ]" + "\n movzbl %%ah, %%edx #edx = x[1].1" + "\n movzbl %%ch, %%edi #edi = x[2].1" + "\n pxor grsoT1(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]" + "\n pxor grsoT1(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]" + "\n shrq $16, %%rax #rax = [ x[1].2, x[1].3, x[1].4, x[1].5, x[1].6, x[1].7, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[2].2, x[2].3, x[2].4, x[2].5, x[2].6, x[2].7, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[1].2" + "\n movzbl %%cl, %%edi #edi = x[2].2" + "\n pxor grsoT2(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]^grsoT2[x[1].2]" + "\n pxor grsoT2(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]" + "\n shrq $24, %%rax #rax = [ x[1].5, x[1].6, x[1].7, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[1].5" + "\n pxor grsoT5(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]" + "\n shrq $8, %%rax #rax = [ x[1].6, x[1].7, 
0, 0, 0, 0, 0, 0 ]" + "\n shrq $32, %%rcx #rcx = [ x[2].6, x[2].7, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[1].6" + "\n movzbl %%cl, %%edi #edi = x[2].6" + "\n pxor grsoT6(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]^grsoT6[x[1].6]" + "\n pxor grsoT6(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]^grsoT6[x[2].6]" + "\n movzbl %%ah, %%edx #edx = x[1].7" + "\n movzbl %%ch, %%edi #edi = x[2].7" + "\n xorl $0x10, %%edx #xor column constant" + "\n xorl $0x20, %%edi #xor column constant" + "\n xorl %%ebx, %%edx #xor round counter" + "\n xorl %%ebx, %%edi #xor round counter" + "\n pxor grsoT7(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]^grsoT4[x[11].4]^grsoT7[x[1].7]" + "\n pxor grsoT7(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]^grsoT2[x[1].2]^grsoT7[x[2].7]" + + "\n ### load a pair of input words x[3], x[4] and process them" + "\n movq %%xmm3, %%rax #rax = [ x[3].0, x[3].1, x[3].2, x[3].3, x[3].4, x[3].5, x[3].6, x[3].7 ]" + "\n movq %%xmm4, %%rcx #rcx = [ x[4].0, x[4].1, x[4].2, x[4].3, x[4].4, x[4].5, x[4].6, x[4].7 ]" + "\n shrq $16, %%rax #rax = [ x[3].2, x[3].3, x[3].4, x[3].5, x[3].6, x[3].7, 0, 0 ]" + "\n shrq $16, %%rcx #rcx = [ x[4].2, x[4].3, x[4].4, x[4].5, x[4].6, x[4].7, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[3].2" + "\n movzbl %%cl, %%edi #edi = x[4].2" + "\n pxor grsoT2(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]^grsoT6[x[2].6]^grsoT2[x[3].2]" + "\n pxor grsoT2(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]^grsoT2[x[4].2]" + "\n movzbl %%ah, %%edx #edx = x[3].3" + "\n movzbl %%ch, %%edi #edi = x[4].3" + "\n pxor grsoT3(,%%rdx,8), %%mm0 
#y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]^grsoT5[x[10].5]^grsoT3[x[3].3]" + "\n pxor grsoT3(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]^grsoT0[x[10].0]^grsoT4[x[9].4]^grsoT3[x[4].3]" + "\n shrq $32, %%rax #rax = [ x[3].6, x[3].7, 0, 0, 0, 0, 0, 0 ]" + "\n shrq $32, %%rcx #rcx = [ x[4].6, x[4].7, 0, 0, 0, 0, 0, 0 ]" + "\n movzbl %%al, %%edx #edx = x[3].6" + "\n pxor grsoT6(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]^grsoT2[x[4].2]^grsoT6[x[3].6]" + "\n movzbl %%ah, %%edx #edx = x[3].7" + "\n movzbl %%ch, %%edi #edi = x[4].7" + "\n xorl $0x30, %%edx #xor column constant" + "\n xorl $0x40, %%edi #xor column constant" + "\n xorl %%ebx, %%edx #xor round counter" + "\n xorl %%ebx, %%edi #xor round counter" + "\n pxor grsoT7(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]^grsoT6[x[1].6]^grsoT7[x[3].7]" + "\n pxor grsoT7(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]^grsoT6[x[2].6]^grsoT2[x[3].2]^grsoT7[x[4].7]" + + + "\n incl %%ebx" + "\n cmp $14, %%ebx" + "\n je 2f" + + + "\n ### move 8 MMX registers to low halves of XMM registers" + "\n movq2dq %%mm0, %%xmm8" + "\n movq2dq %%mm1, %%xmm9" + "\n movq2dq %%mm2, %%xmm10" + "\n movq2dq %%mm3, %%xmm11" + "\n movq2dq %%mm4, %%xmm12" + "\n movq2dq %%mm5, %%xmm13" + "\n movq2dq %%mm6, %%xmm14" + "\n movq2dq %%mm7, %%xmm15" + + "\n ### read back 8 words of input state from memory to 8 low halves of XMM registers xmm0...xmm15" + "\n movaps 0(%0), %%xmm0" + "\n movhlps %%xmm0, %%xmm1" + "\n movaps 16(%0), %%xmm2" + "\n movhlps %%xmm2, %%xmm3" + "\n movaps 32(%0), %%xmm4" + "\n movhlps %%xmm4, %%xmm5" + "\n movaps 48(%0), %%xmm6" + "\n movhlps %%xmm6, %%xmm7" + "\n jmp 1b" + + "\n 2: # finalization" + + "\n ### writes contents of MM0..MM7 to memory " + "\n movq %%mm0, 
64(%0)"
+  "\n movq %%mm1, 72(%0)"
+  "\n movq %%mm2, 80(%0)"
+  "\n movq %%mm3, 88(%0)"
+  "\n movq %%mm4, 96(%0)"
+  "\n movq %%mm5, 104(%0)"
+  "\n movq %%mm6, 112(%0)"
+  "\n movq %%mm7, 120(%0)"
+: /* no output operands; only memory (reached through %0) is modified */
+: "r"(x)
+: "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "memory", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" , "%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , "%xmm8" , "%xmm9" , "%xmm10" , "%xmm11" , "%xmm12" , "%xmm13" , "%xmm14" , "%xmm15" );
+
+
+}//Q1024ASM()
+
+
diff --git a/algo/groestl/sse2/grso-asm2.h b/algo/groestl/sse2/grso-asm2.h
new file mode 100644
index 0000000..56afbdc
--- /dev/null
+++ b/algo/groestl/sse2/grso-asm2.h
@@ -0,0 +1,11 @@
+#ifndef GRSOASM_H
+#define GRSOASM_H
+/* Declares the same two entry points as the mmx asm.h header. */
+/* Kept as a separate file just in case this variant ever has to change. */
+#include "grso.h"
+
+void grsoP1024ASM (u64 *x) ;
+
+void grsoQ1024ASM (u64 *x) ;
+
+#endif
diff --git a/algo/groestl/sse2/grso-macro.c b/algo/groestl/sse2/grso-macro.c
new file mode 100644
index 0000000..9652620
--- /dev/null
+++ b/algo/groestl/sse2/grso-macro.c
@@ -0,0 +1,110 @@
+/* hash.c January 2011
+ *
+ * Groestl-512 implementation with inline assembly containing mmx and
+ * sse instructions. Optimized for Opteron.
+ * Authors: Krystian Matusiewicz and Soeren S.
Thomsen + * + * This code is placed in the public domain + */ + +//#include "grso.h" +//#include "grso-asm.h" +// #include "grsotab.h" + +#define DECL_GRS + +/* load initial constants */ +#define GRS_I \ +do { \ + int i; \ + /* set initial value */ \ + for (i = 0; i < grsoCOLS-1; i++) sts_grs.grsstate[i] = 0; \ + sts_grs.grsstate[grsoCOLS-1] = grsoU64BIG((u64)(8*grsoDIGESTSIZE)); \ + \ + /* set other variables */ \ + sts_grs.grsbuf_ptr = 0; \ + sts_grs.grsblock_counter = 0; \ +} while (0); \ + +/* load hash */ +#define GRS_U \ +do { \ + unsigned char* in = hash; \ + unsigned long long index = 0; \ + \ + /* if the buffer contains data that has not yet been digested, first \ + add data to buffer until full */ \ + if (sts_grs.grsbuf_ptr) { \ + while (sts_grs.grsbuf_ptr < grsoSIZE && index < 64) { \ + hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \ + } \ + if (sts_grs.grsbuf_ptr < grsoSIZE) continue; \ + \ + /* digest buffer */ \ + sts_grs.grsbuf_ptr = 0; \ + grsoTransform(&sts_grs, hashbuf, grsoSIZE); \ + } \ + \ + /* digest bulk of message */ \ + grsoTransform(&sts_grs, in+index, 64-index); \ + index += ((64-index)/grsoSIZE)*grsoSIZE; \ + \ + /* store remaining data in buffer */ \ + while (index < 64) { \ + hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \ + } \ + \ +} while (0); + +/* groestl512 hash loaded */ +/* hash = groestl512(loaded) */ +#define GRS_C \ +do { \ + char *out = hash; \ + int i, j = 0; \ + unsigned char *s = (unsigned char*)sts_grs.grsstate; \ + \ + hashbuf[sts_grs.grsbuf_ptr++] = 0x80; \ + \ + /* pad with '0'-bits */ \ + if (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \ + /* padding requires two blocks */ \ + while (sts_grs.grsbuf_ptr < grsoSIZE) { \ + hashbuf[sts_grs.grsbuf_ptr++] = 0; \ + } \ + /* digest first padding block */ \ + grsoTransform(&sts_grs, hashbuf, grsoSIZE); \ + sts_grs.grsbuf_ptr = 0; \ + } \ + while (sts_grs.grsbuf_ptr < grsoSIZE-grsoLENGTHFIELDLEN) { \ + hashbuf[sts_grs.grsbuf_ptr++] = 0; \ + } \ + \ + /* length 
padding */ \ + sts_grs.grsblock_counter++; \ + sts_grs.grsbuf_ptr = grsoSIZE; \ + while (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \ + hashbuf[--sts_grs.grsbuf_ptr] = (unsigned char)sts_grs.grsblock_counter; \ + sts_grs.grsblock_counter >>= 8; \ + } \ + \ + /* digest final padding block */ \ + grsoTransform(&sts_grs, hashbuf, grsoSIZE); \ + /* perform output transformation */ \ + grsoOutputTransformation(&sts_grs); \ + \ + /* store hash result in output */ \ + for (i = grsoSIZE-grsoDIGESTSIZE; i < grsoSIZE; i++,j++) { \ + out[j] = s[i]; \ + } \ + \ + /* zeroise relevant variables and deallocate memory */ \ + for (i = 0; i < grsoCOLS; i++) { \ + sts_grs.grsstate[i] = 0; \ + } \ + for (i = 0; i < grsoSIZE; i++) { \ + hashbuf[i] = 0; \ + } \ +} while (0); + + diff --git a/algo/groestl/sse2/grso.c b/algo/groestl/sse2/grso.c new file mode 100644 index 0000000..19de648 --- /dev/null +++ b/algo/groestl/sse2/grso.c @@ -0,0 +1,57 @@ +/* hash.c January 2011 + * + * Groestl-512 implementation with inline assembly containing mmx and + * sse instructions. Optimized for Opteron. + * Authors: Krystian Matusiewicz and Soeren S. 
Thomsen + * + * This code is placed in the public domain + */ + +#include "algo/groestl/sse2/grso-asm.h" +#include "algo/groestl/sse2/grso.h" +#include "algo/groestl/sse2/grsotab.h" + +/* digest up to len bytes of input (full blocks only) */ +void grsoTransform(grsoState *ctx, + const unsigned char *in, + unsigned long long len) { + u64 y[grsoCOLS+2] __attribute__ ((aligned (16))); + u64 z[grsoCOLS+2] __attribute__ ((aligned (16))); + u64 *m, *h = (u64*)ctx->grsstate; + int i; + + /* increment block counter */ + ctx->grsblock_counter += len/grsoSIZE; + + /* digest message, one block at a time */ + for (; len >= grsoSIZE; len -= grsoSIZE, in += grsoSIZE) { + m = (u64*)in; + for (i = 0; i < grsoCOLS; i++) { + y[i] = m[i]; + z[i] = m[i] ^ h[i]; + } + + grsoQ1024ASM(y); + grsoP1024ASM(z); + + /* h' == h + Q(m) + P(h+m) */ + for (i = 0; i < grsoCOLS; i++) { + h[i] ^= z[i] ^ y[i]; + } + } +} + +/* given state h, do h <- P(h)+h */ +void grsoOutputTransformation(grsoState *ctx) { + u64 z[grsoCOLS] __attribute__ ((aligned (16))); + int j; + + for (j = 0; j < grsoCOLS; j++) { + z[j] = ctx->grsstate[j]; + } + grsoP1024ASM(z); + for (j = 0; j < grsoCOLS; j++) { + ctx->grsstate[j] ^= z[j]; + } +} + diff --git a/algo/groestl/sse2/grso.h b/algo/groestl/sse2/grso.h new file mode 100644 index 0000000..c0b513e --- /dev/null +++ b/algo/groestl/sse2/grso.h @@ -0,0 +1,62 @@ +#ifndef __hash_h +#define __hash_h + +#include +#include +#include "brg_endian.h" +#include "brg_types.h" + +/* some sizes (number of bytes) */ +#define grsoROWS 8 +#define grsoLENGTHFIELDLEN grsoROWS +#define grsoCOLS 16 +#define grsoSIZE (grsoROWS*grsoCOLS) +#define grsoDIGESTSIZE 64 + +#define grsoROUNDS 14 + +#define grsoROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&((u64)0xffffffffffffffffULL)) + +#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) +#error +#endif /* IS_BIG_ENDIAN */ + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) +#define grsoU64BIG(a) \ + 
((grsoROTL64(a, 8) & ((u64)0x000000ff000000ffULL)) | \ + (grsoROTL64(a,24) & ((u64)0x0000ff000000ff00ULL)) | \ + (grsoROTL64(a,40) & ((u64)0x00ff000000ff0000ULL)) | \ + (grsoROTL64(a,56) & ((u64)0xff000000ff000000ULL))) +#endif /* IS_LITTLE_ENDIAN */ + +typedef struct { + u64 grsstate[grsoCOLS]; /* actual state */ + u64 grsblock_counter; /* message block counter */ + int grsbuf_ptr; /* data buffer pointer */ +} grsoState; + +//extern int grsoInit(grsoState* ctx); +//extern int grsoUpdate(grsoState* ctx, const unsigned char* in, +// unsigned long long len); +//extern int grsoUpdateq(grsoState* ctx, const unsigned char* in); +//extern int grsoFinal(grsoState* ctx, +// unsigned char* out); +// +//extern int grsohash(unsigned char *out, +// const unsigned char *in, +// unsigned long long len); + +/* digest up to len bytes of input (full blocks only) */ +void grsoTransform( grsoState *ctx, const unsigned char *in, + unsigned long long len ); + +/* given state h, do h <- P(h)+h */ +void grsoOutputTransformation( grsoState *ctx ); + +int grso_init ( grsoState* sts_grs ); +int grso_update ( grsoState* sts_grs, char* hashbuf, char* hash ); +int grso_close ( grsoState *sts_grs, char* hashbuf, char* hash ); + + +#endif /* __hash_h */ diff --git a/algo/groestl/sse2/grsotab.h b/algo/groestl/sse2/grsotab.h new file mode 100644 index 0000000..ebb040d --- /dev/null +++ b/algo/groestl/sse2/grsotab.h @@ -0,0 +1,23 @@ +#ifndef __tables_h +#define __tables_h + +#include "grso.h" + +__attribute__ ((aligned (16))) const u64 grsoT0[256] = 
+{0xc6a597f4a5f432c6ULL,0xf884eb9784976ff8ULL,0xee99c7b099b05eeeULL,0xf68df78c8d8c7af6ULL,0xff0de5170d17e8ffULL,0xd6bdb7dcbddc0ad6ULL,0xdeb1a7c8b1c816deULL,0x915439fc54fc6d91ULL,0x6050c0f050f09060ULL,0x0203040503050702ULL,0xcea987e0a9e02eceULL,0x567dac877d87d156ULL,0xe719d52b192bcce7ULL,0xb56271a662a613b5ULL,0x4de69a31e6317c4dULL,0xec9ac3b59ab559ecULL,0x8f4505cf45cf408fULL,0x1f9d3ebc9dbca31fULL,0x894009c040c04989ULL,0xfa87ef92879268faULL,0xef15c53f153fd0efULL,0xb2eb7f26eb2694b2ULL,0x8ec90740c940ce8eULL,0xfb0bed1d0b1de6fbULL,0x41ec822fec2f6e41ULL,0xb3677da967a91ab3ULL,0x5ffdbe1cfd1c435fULL,0x45ea8a25ea256045ULL,0x23bf46dabfdaf923ULL,0x53f7a602f7025153ULL,0xe496d3a196a145e4ULL,0x9b5b2ded5bed769bULL,0x75c2ea5dc25d2875ULL,0xe11cd9241c24c5e1ULL,0x3dae7ae9aee9d43dULL,0x4c6a98be6abef24cULL,0x6c5ad8ee5aee826cULL,0x7e41fcc341c3bd7eULL,0xf502f1060206f3f5ULL,0x834f1dd14fd15283ULL,0x685cd0e45ce48c68ULL,0x51f4a207f4075651ULL,0xd134b95c345c8dd1ULL,0xf908e9180818e1f9ULL,0xe293dfae93ae4ce2ULL,0xab734d9573953eabULL,0x6253c4f553f59762ULL,0x2a3f54413f416b2aULL,0x080c10140c141c08ULL,0x955231f652f66395ULL,0x46658caf65afe946ULL,0x9d5e21e25ee27f9dULL,0x3028607828784830ULL,0x37a16ef8a1f8cf37ULL,0x0a0f14110f111b0aULL,0x2fb55ec4b5c4eb2fULL,0x0e091c1b091b150eULL,0x2436485a365a7e24ULL,0x1b9b36b69bb6ad1bULL,0xdf3da5473d4798dfULL,0xcd26816a266aa7cdULL,0x4e699cbb69bbf54eULL,0x7fcdfe4ccd4c337fULL,0xea9fcfba9fba50eaULL,0x121b242d1b2d3f12ULL,0x1d9e3ab99eb9a41dULL,0x5874b09c749cc458ULL,0x342e68722e724634ULL,0x362d6c772d774136ULL,0xdcb2a3cdb2cd11dcULL,0xb4ee7329ee299db4ULL,0x5bfbb616fb164d5bULL,0xa4f65301f601a5a4ULL,0x764decd74dd7a176ULL,0xb76175a361a314b7ULL,0x7dcefa49ce49347dULL,0x527ba48d7b8ddf52ULL,0xdd3ea1423e429fddULL,0x5e71bc937193cd5eULL,0x139726a297a2b113ULL,0xa6f55704f504a2a6ULL,0xb96869b868b801b9ULL,0x0000000000000000ULL,0xc12c99742c74b5c1ULL,0x406080a060a0e040ULL,0xe31fdd211f21c2e3ULL,0x79c8f243c8433a79ULL,0xb6ed772ced2c9ab6ULL,0xd4beb3d9bed90dd4ULL,0x8d4601ca46ca478dULL,0x67d9ce70d9701767
ULL,0x724be4dd4bddaf72ULL,0x94de3379de79ed94ULL,0x98d42b67d467ff98ULL,0xb0e87b23e82393b0ULL,0x854a11de4ade5b85ULL,0xbb6b6dbd6bbd06bbULL,0xc52a917e2a7ebbc5ULL,0x4fe59e34e5347b4fULL,0xed16c13a163ad7edULL,0x86c51754c554d286ULL,0x9ad72f62d762f89aULL,0x6655ccff55ff9966ULL,0x119422a794a7b611ULL,0x8acf0f4acf4ac08aULL,0xe910c9301030d9e9ULL,0x0406080a060a0e04ULL,0xfe81e798819866feULL,0xa0f05b0bf00baba0ULL,0x7844f0cc44ccb478ULL,0x25ba4ad5bad5f025ULL,0x4be3963ee33e754bULL,0xa2f35f0ef30eaca2ULL,0x5dfeba19fe19445dULL,0x80c01b5bc05bdb80ULL,0x058a0a858a858005ULL,0x3fad7eecadecd33fULL,0x21bc42dfbcdffe21ULL,0x7048e0d848d8a870ULL,0xf104f90c040cfdf1ULL,0x63dfc67adf7a1963ULL,0x77c1ee58c1582f77ULL,0xaf75459f759f30afULL,0x426384a563a5e742ULL,0x2030405030507020ULL,0xe51ad12e1a2ecbe5ULL,0xfd0ee1120e12effdULL,0xbf6d65b76db708bfULL,0x814c19d44cd45581ULL,0x1814303c143c2418ULL,0x26354c5f355f7926ULL,0xc32f9d712f71b2c3ULL,0xbee16738e13886beULL,0x35a26afda2fdc835ULL,0x88cc0b4fcc4fc788ULL,0x2e395c4b394b652eULL,0x93573df957f96a93ULL,0x55f2aa0df20d5855ULL,0xfc82e39d829d61fcULL,0x7a47f4c947c9b37aULL,0xc8ac8befacef27c8ULL,0xbae76f32e73288baULL,0x322b647d2b7d4f32ULL,0xe695d7a495a442e6ULL,0xc0a09bfba0fb3bc0ULL,0x199832b398b3aa19ULL,0x9ed12768d168f69eULL,0xa37f5d817f8122a3ULL,0x446688aa66aaee44ULL,0x547ea8827e82d654ULL,0x3bab76e6abe6dd3bULL,0x0b83169e839e950bULL,0x8cca0345ca45c98cULL,0xc729957b297bbcc7ULL,0x6bd3d66ed36e056bULL,0x283c50443c446c28ULL,0xa779558b798b2ca7ULL,0xbce2633de23d81bcULL,0x161d2c271d273116ULL,0xad76419a769a37adULL,0xdb3bad4d3b4d96dbULL,0x6456c8fa56fa9e64ULL,0x744ee8d24ed2a674ULL,0x141e28221e223614ULL,0x92db3f76db76e492ULL,0x0c0a181e0a1e120cULL,0x486c90b46cb4fc48ULL,0xb8e46b37e4378fb8ULL,0x9f5d25e75de7789fULL,0xbd6e61b26eb20fbdULL,0x43ef862aef2a6943ULL,0xc4a693f1a6f135c4ULL,0x39a872e3a8e3da39ULL,0x31a462f7a4f7c631ULL,0xd337bd5937598ad3ULL,0xf28bff868b8674f2ULL,0xd532b156325683d5ULL,0x8b430dc543c54e8bULL,0x6e59dceb59eb856eULL,0xdab7afc2b7c218daULL,0x018c028f8c8f8e01ULL,0xb16479ac64ac1d
b1ULL,0x9cd2236dd26df19cULL,0x49e0923be03b7249ULL,0xd8b4abc7b4c71fd8ULL,0xacfa4315fa15b9acULL,0xf307fd090709faf3ULL,0xcf25856f256fa0cfULL,0xcaaf8feaafea20caULL,0xf48ef3898e897df4ULL,0x47e98e20e9206747ULL,0x1018202818283810ULL,0x6fd5de64d5640b6fULL,0xf088fb83888373f0ULL,0x4a6f94b16fb1fb4aULL,0x5c72b8967296ca5cULL,0x3824706c246c5438ULL,0x57f1ae08f1085f57ULL,0x73c7e652c7522173ULL,0x975135f351f36497ULL,0xcb238d652365aecbULL,0xa17c59847c8425a1ULL,0xe89ccbbf9cbf57e8ULL,0x3e217c6321635d3eULL,0x96dd377cdd7cea96ULL,0x61dcc27fdc7f1e61ULL,0x0d861a9186919c0dULL,0x0f851e9485949b0fULL,0xe090dbab90ab4be0ULL,0x7c42f8c642c6ba7cULL,0x71c4e257c4572671ULL,0xccaa83e5aae529ccULL,0x90d83b73d873e390ULL,0x06050c0f050f0906ULL,0xf701f5030103f4f7ULL,0x1c12383612362a1cULL,0xc2a39ffea3fe3cc2ULL,0x6a5fd4e15fe18b6aULL,0xaef94710f910beaeULL,0x69d0d26bd06b0269ULL,0x17912ea891a8bf17ULL,0x995829e858e87199ULL,0x3a2774692769533aULL,0x27b94ed0b9d0f727ULL,0xd938a948384891d9ULL,0xeb13cd351335deebULL,0x2bb356ceb3cee52bULL,0x2233445533557722ULL,0xd2bbbfd6bbd604d2ULL,0xa9704990709039a9ULL,0x07890e8089808707ULL,0x33a766f2a7f2c133ULL,0x2db65ac1b6c1ec2dULL,0x3c22786622665a3cULL,0x15922aad92adb815ULL,0xc92089602060a9c9ULL,0x874915db49db5c87ULL,0xaaff4f1aff1ab0aaULL,0x5078a0887888d850ULL,0xa57a518e7a8e2ba5ULL,0x038f068a8f8a8903ULL,0x59f8b213f8134a59ULL,0x0980129b809b9209ULL,0x1a1734391739231aULL,0x65daca75da751065ULL,0xd731b553315384d7ULL,0x84c61351c651d584ULL,0xd0b8bbd3b8d303d0ULL,0x82c31f5ec35edc82ULL,0x29b052cbb0cbe229ULL,0x5a77b4997799c35aULL,0x1e113c3311332d1eULL,0x7bcbf646cb463d7bULL,0xa8fc4b1ffc1fb7a8ULL,0x6dd6da61d6610c6dULL,0x2c3a584e3a4e622cULL}; +__attribute__ ((aligned (16))) const u64 grsoT1[256] = 
+{0xa597f4a5f432c6c6ULL,0x84eb9784976ff8f8ULL,0x99c7b099b05eeeeeULL,0x8df78c8d8c7af6f6ULL,0x0de5170d17e8ffffULL,0xbdb7dcbddc0ad6d6ULL,0xb1a7c8b1c816dedeULL,0x5439fc54fc6d9191ULL,0x50c0f050f0906060ULL,0x0304050305070202ULL,0xa987e0a9e02ececeULL,0x7dac877d87d15656ULL,0x19d52b192bcce7e7ULL,0x6271a662a613b5b5ULL,0xe69a31e6317c4d4dULL,0x9ac3b59ab559ececULL,0x4505cf45cf408f8fULL,0x9d3ebc9dbca31f1fULL,0x4009c040c0498989ULL,0x87ef92879268fafaULL,0x15c53f153fd0efefULL,0xeb7f26eb2694b2b2ULL,0xc90740c940ce8e8eULL,0x0bed1d0b1de6fbfbULL,0xec822fec2f6e4141ULL,0x677da967a91ab3b3ULL,0xfdbe1cfd1c435f5fULL,0xea8a25ea25604545ULL,0xbf46dabfdaf92323ULL,0xf7a602f702515353ULL,0x96d3a196a145e4e4ULL,0x5b2ded5bed769b9bULL,0xc2ea5dc25d287575ULL,0x1cd9241c24c5e1e1ULL,0xae7ae9aee9d43d3dULL,0x6a98be6abef24c4cULL,0x5ad8ee5aee826c6cULL,0x41fcc341c3bd7e7eULL,0x02f1060206f3f5f5ULL,0x4f1dd14fd1528383ULL,0x5cd0e45ce48c6868ULL,0xf4a207f407565151ULL,0x34b95c345c8dd1d1ULL,0x08e9180818e1f9f9ULL,0x93dfae93ae4ce2e2ULL,0x734d9573953eababULL,0x53c4f553f5976262ULL,0x3f54413f416b2a2aULL,0x0c10140c141c0808ULL,0x5231f652f6639595ULL,0x658caf65afe94646ULL,0x5e21e25ee27f9d9dULL,0x2860782878483030ULL,0xa16ef8a1f8cf3737ULL,0x0f14110f111b0a0aULL,0xb55ec4b5c4eb2f2fULL,0x091c1b091b150e0eULL,0x36485a365a7e2424ULL,0x9b36b69bb6ad1b1bULL,0x3da5473d4798dfdfULL,0x26816a266aa7cdcdULL,0x699cbb69bbf54e4eULL,0xcdfe4ccd4c337f7fULL,0x9fcfba9fba50eaeaULL,0x1b242d1b2d3f1212ULL,0x9e3ab99eb9a41d1dULL,0x74b09c749cc45858ULL,0x2e68722e72463434ULL,0x2d6c772d77413636ULL,0xb2a3cdb2cd11dcdcULL,0xee7329ee299db4b4ULL,0xfbb616fb164d5b5bULL,0xf65301f601a5a4a4ULL,0x4decd74dd7a17676ULL,0x6175a361a314b7b7ULL,0xcefa49ce49347d7dULL,0x7ba48d7b8ddf5252ULL,0x3ea1423e429fddddULL,0x71bc937193cd5e5eULL,0x9726a297a2b11313ULL,0xf55704f504a2a6a6ULL,0x6869b868b801b9b9ULL,0x0000000000000000ULL,0x2c99742c74b5c1c1ULL,0x6080a060a0e04040ULL,0x1fdd211f21c2e3e3ULL,0xc8f243c8433a7979ULL,0xed772ced2c9ab6b6ULL,0xbeb3d9bed90dd4d4ULL,0x4601ca46ca478d8dULL,0xd9ce70d970176767
ULL,0x4be4dd4bddaf7272ULL,0xde3379de79ed9494ULL,0xd42b67d467ff9898ULL,0xe87b23e82393b0b0ULL,0x4a11de4ade5b8585ULL,0x6b6dbd6bbd06bbbbULL,0x2a917e2a7ebbc5c5ULL,0xe59e34e5347b4f4fULL,0x16c13a163ad7ededULL,0xc51754c554d28686ULL,0xd72f62d762f89a9aULL,0x55ccff55ff996666ULL,0x9422a794a7b61111ULL,0xcf0f4acf4ac08a8aULL,0x10c9301030d9e9e9ULL,0x06080a060a0e0404ULL,0x81e798819866fefeULL,0xf05b0bf00baba0a0ULL,0x44f0cc44ccb47878ULL,0xba4ad5bad5f02525ULL,0xe3963ee33e754b4bULL,0xf35f0ef30eaca2a2ULL,0xfeba19fe19445d5dULL,0xc01b5bc05bdb8080ULL,0x8a0a858a85800505ULL,0xad7eecadecd33f3fULL,0xbc42dfbcdffe2121ULL,0x48e0d848d8a87070ULL,0x04f90c040cfdf1f1ULL,0xdfc67adf7a196363ULL,0xc1ee58c1582f7777ULL,0x75459f759f30afafULL,0x6384a563a5e74242ULL,0x3040503050702020ULL,0x1ad12e1a2ecbe5e5ULL,0x0ee1120e12effdfdULL,0x6d65b76db708bfbfULL,0x4c19d44cd4558181ULL,0x14303c143c241818ULL,0x354c5f355f792626ULL,0x2f9d712f71b2c3c3ULL,0xe16738e13886bebeULL,0xa26afda2fdc83535ULL,0xcc0b4fcc4fc78888ULL,0x395c4b394b652e2eULL,0x573df957f96a9393ULL,0xf2aa0df20d585555ULL,0x82e39d829d61fcfcULL,0x47f4c947c9b37a7aULL,0xac8befacef27c8c8ULL,0xe76f32e73288babaULL,0x2b647d2b7d4f3232ULL,0x95d7a495a442e6e6ULL,0xa09bfba0fb3bc0c0ULL,0x9832b398b3aa1919ULL,0xd12768d168f69e9eULL,0x7f5d817f8122a3a3ULL,0x6688aa66aaee4444ULL,0x7ea8827e82d65454ULL,0xab76e6abe6dd3b3bULL,0x83169e839e950b0bULL,0xca0345ca45c98c8cULL,0x29957b297bbcc7c7ULL,0xd3d66ed36e056b6bULL,0x3c50443c446c2828ULL,0x79558b798b2ca7a7ULL,0xe2633de23d81bcbcULL,0x1d2c271d27311616ULL,0x76419a769a37adadULL,0x3bad4d3b4d96dbdbULL,0x56c8fa56fa9e6464ULL,0x4ee8d24ed2a67474ULL,0x1e28221e22361414ULL,0xdb3f76db76e49292ULL,0x0a181e0a1e120c0cULL,0x6c90b46cb4fc4848ULL,0xe46b37e4378fb8b8ULL,0x5d25e75de7789f9fULL,0x6e61b26eb20fbdbdULL,0xef862aef2a694343ULL,0xa693f1a6f135c4c4ULL,0xa872e3a8e3da3939ULL,0xa462f7a4f7c63131ULL,0x37bd5937598ad3d3ULL,0x8bff868b8674f2f2ULL,0x32b156325683d5d5ULL,0x430dc543c54e8b8bULL,0x59dceb59eb856e6eULL,0xb7afc2b7c218dadaULL,0x8c028f8c8f8e0101ULL,0x6479ac64ac1db1
b1ULL,0xd2236dd26df19c9cULL,0xe0923be03b724949ULL,0xb4abc7b4c71fd8d8ULL,0xfa4315fa15b9acacULL,0x07fd090709faf3f3ULL,0x25856f256fa0cfcfULL,0xaf8feaafea20cacaULL,0x8ef3898e897df4f4ULL,0xe98e20e920674747ULL,0x1820281828381010ULL,0xd5de64d5640b6f6fULL,0x88fb83888373f0f0ULL,0x6f94b16fb1fb4a4aULL,0x72b8967296ca5c5cULL,0x24706c246c543838ULL,0xf1ae08f1085f5757ULL,0xc7e652c752217373ULL,0x5135f351f3649797ULL,0x238d652365aecbcbULL,0x7c59847c8425a1a1ULL,0x9ccbbf9cbf57e8e8ULL,0x217c6321635d3e3eULL,0xdd377cdd7cea9696ULL,0xdcc27fdc7f1e6161ULL,0x861a9186919c0d0dULL,0x851e9485949b0f0fULL,0x90dbab90ab4be0e0ULL,0x42f8c642c6ba7c7cULL,0xc4e257c457267171ULL,0xaa83e5aae529ccccULL,0xd83b73d873e39090ULL,0x050c0f050f090606ULL,0x01f5030103f4f7f7ULL,0x12383612362a1c1cULL,0xa39ffea3fe3cc2c2ULL,0x5fd4e15fe18b6a6aULL,0xf94710f910beaeaeULL,0xd0d26bd06b026969ULL,0x912ea891a8bf1717ULL,0x5829e858e8719999ULL,0x2774692769533a3aULL,0xb94ed0b9d0f72727ULL,0x38a948384891d9d9ULL,0x13cd351335deebebULL,0xb356ceb3cee52b2bULL,0x3344553355772222ULL,0xbbbfd6bbd604d2d2ULL,0x704990709039a9a9ULL,0x890e808980870707ULL,0xa766f2a7f2c13333ULL,0xb65ac1b6c1ec2d2dULL,0x22786622665a3c3cULL,0x922aad92adb81515ULL,0x2089602060a9c9c9ULL,0x4915db49db5c8787ULL,0xff4f1aff1ab0aaaaULL,0x78a0887888d85050ULL,0x7a518e7a8e2ba5a5ULL,0x8f068a8f8a890303ULL,0xf8b213f8134a5959ULL,0x80129b809b920909ULL,0x1734391739231a1aULL,0xdaca75da75106565ULL,0x31b553315384d7d7ULL,0xc61351c651d58484ULL,0xb8bbd3b8d303d0d0ULL,0xc31f5ec35edc8282ULL,0xb052cbb0cbe22929ULL,0x77b4997799c35a5aULL,0x113c3311332d1e1eULL,0xcbf646cb463d7b7bULL,0xfc4b1ffc1fb7a8a8ULL,0xd6da61d6610c6d6dULL,0x3a584e3a4e622c2cULL}; +__attribute__ ((aligned (16))) const u64 grsoT2[256] = 
+{0x97f4a5f432c6c6a5ULL,0xeb9784976ff8f884ULL,0xc7b099b05eeeee99ULL,0xf78c8d8c7af6f68dULL,0xe5170d17e8ffff0dULL,0xb7dcbddc0ad6d6bdULL,0xa7c8b1c816dedeb1ULL,0x39fc54fc6d919154ULL,0xc0f050f090606050ULL,0x0405030507020203ULL,0x87e0a9e02ececea9ULL,0xac877d87d156567dULL,0xd52b192bcce7e719ULL,0x71a662a613b5b562ULL,0x9a31e6317c4d4de6ULL,0xc3b59ab559ecec9aULL,0x05cf45cf408f8f45ULL,0x3ebc9dbca31f1f9dULL,0x09c040c049898940ULL,0xef92879268fafa87ULL,0xc53f153fd0efef15ULL,0x7f26eb2694b2b2ebULL,0x0740c940ce8e8ec9ULL,0xed1d0b1de6fbfb0bULL,0x822fec2f6e4141ecULL,0x7da967a91ab3b367ULL,0xbe1cfd1c435f5ffdULL,0x8a25ea25604545eaULL,0x46dabfdaf92323bfULL,0xa602f702515353f7ULL,0xd3a196a145e4e496ULL,0x2ded5bed769b9b5bULL,0xea5dc25d287575c2ULL,0xd9241c24c5e1e11cULL,0x7ae9aee9d43d3daeULL,0x98be6abef24c4c6aULL,0xd8ee5aee826c6c5aULL,0xfcc341c3bd7e7e41ULL,0xf1060206f3f5f502ULL,0x1dd14fd15283834fULL,0xd0e45ce48c68685cULL,0xa207f407565151f4ULL,0xb95c345c8dd1d134ULL,0xe9180818e1f9f908ULL,0xdfae93ae4ce2e293ULL,0x4d9573953eabab73ULL,0xc4f553f597626253ULL,0x54413f416b2a2a3fULL,0x10140c141c08080cULL,0x31f652f663959552ULL,0x8caf65afe9464665ULL,0x21e25ee27f9d9d5eULL,0x6078287848303028ULL,0x6ef8a1f8cf3737a1ULL,0x14110f111b0a0a0fULL,0x5ec4b5c4eb2f2fb5ULL,0x1c1b091b150e0e09ULL,0x485a365a7e242436ULL,0x36b69bb6ad1b1b9bULL,0xa5473d4798dfdf3dULL,0x816a266aa7cdcd26ULL,0x9cbb69bbf54e4e69ULL,0xfe4ccd4c337f7fcdULL,0xcfba9fba50eaea9fULL,0x242d1b2d3f12121bULL,0x3ab99eb9a41d1d9eULL,0xb09c749cc4585874ULL,0x68722e724634342eULL,0x6c772d774136362dULL,0xa3cdb2cd11dcdcb2ULL,0x7329ee299db4b4eeULL,0xb616fb164d5b5bfbULL,0x5301f601a5a4a4f6ULL,0xecd74dd7a176764dULL,0x75a361a314b7b761ULL,0xfa49ce49347d7dceULL,0xa48d7b8ddf52527bULL,0xa1423e429fdddd3eULL,0xbc937193cd5e5e71ULL,0x26a297a2b1131397ULL,0x5704f504a2a6a6f5ULL,0x69b868b801b9b968ULL,0x0000000000000000ULL,0x99742c74b5c1c12cULL,0x80a060a0e0404060ULL,0xdd211f21c2e3e31fULL,0xf243c8433a7979c8ULL,0x772ced2c9ab6b6edULL,0xb3d9bed90dd4d4beULL,0x01ca46ca478d8d46ULL,0xce70d970176767d9
ULL,0xe4dd4bddaf72724bULL,0x3379de79ed9494deULL,0x2b67d467ff9898d4ULL,0x7b23e82393b0b0e8ULL,0x11de4ade5b85854aULL,0x6dbd6bbd06bbbb6bULL,0x917e2a7ebbc5c52aULL,0x9e34e5347b4f4fe5ULL,0xc13a163ad7eded16ULL,0x1754c554d28686c5ULL,0x2f62d762f89a9ad7ULL,0xccff55ff99666655ULL,0x22a794a7b6111194ULL,0x0f4acf4ac08a8acfULL,0xc9301030d9e9e910ULL,0x080a060a0e040406ULL,0xe798819866fefe81ULL,0x5b0bf00baba0a0f0ULL,0xf0cc44ccb4787844ULL,0x4ad5bad5f02525baULL,0x963ee33e754b4be3ULL,0x5f0ef30eaca2a2f3ULL,0xba19fe19445d5dfeULL,0x1b5bc05bdb8080c0ULL,0x0a858a858005058aULL,0x7eecadecd33f3fadULL,0x42dfbcdffe2121bcULL,0xe0d848d8a8707048ULL,0xf90c040cfdf1f104ULL,0xc67adf7a196363dfULL,0xee58c1582f7777c1ULL,0x459f759f30afaf75ULL,0x84a563a5e7424263ULL,0x4050305070202030ULL,0xd12e1a2ecbe5e51aULL,0xe1120e12effdfd0eULL,0x65b76db708bfbf6dULL,0x19d44cd45581814cULL,0x303c143c24181814ULL,0x4c5f355f79262635ULL,0x9d712f71b2c3c32fULL,0x6738e13886bebee1ULL,0x6afda2fdc83535a2ULL,0x0b4fcc4fc78888ccULL,0x5c4b394b652e2e39ULL,0x3df957f96a939357ULL,0xaa0df20d585555f2ULL,0xe39d829d61fcfc82ULL,0xf4c947c9b37a7a47ULL,0x8befacef27c8c8acULL,0x6f32e73288babae7ULL,0x647d2b7d4f32322bULL,0xd7a495a442e6e695ULL,0x9bfba0fb3bc0c0a0ULL,0x32b398b3aa191998ULL,0x2768d168f69e9ed1ULL,0x5d817f8122a3a37fULL,0x88aa66aaee444466ULL,0xa8827e82d654547eULL,0x76e6abe6dd3b3babULL,0x169e839e950b0b83ULL,0x0345ca45c98c8ccaULL,0x957b297bbcc7c729ULL,0xd66ed36e056b6bd3ULL,0x50443c446c28283cULL,0x558b798b2ca7a779ULL,0x633de23d81bcbce2ULL,0x2c271d273116161dULL,0x419a769a37adad76ULL,0xad4d3b4d96dbdb3bULL,0xc8fa56fa9e646456ULL,0xe8d24ed2a674744eULL,0x28221e223614141eULL,0x3f76db76e49292dbULL,0x181e0a1e120c0c0aULL,0x90b46cb4fc48486cULL,0x6b37e4378fb8b8e4ULL,0x25e75de7789f9f5dULL,0x61b26eb20fbdbd6eULL,0x862aef2a694343efULL,0x93f1a6f135c4c4a6ULL,0x72e3a8e3da3939a8ULL,0x62f7a4f7c63131a4ULL,0xbd5937598ad3d337ULL,0xff868b8674f2f28bULL,0xb156325683d5d532ULL,0x0dc543c54e8b8b43ULL,0xdceb59eb856e6e59ULL,0xafc2b7c218dadab7ULL,0x028f8c8f8e01018cULL,0x79ac64ac1db1b1
64ULL,0x236dd26df19c9cd2ULL,0x923be03b724949e0ULL,0xabc7b4c71fd8d8b4ULL,0x4315fa15b9acacfaULL,0xfd090709faf3f307ULL,0x856f256fa0cfcf25ULL,0x8feaafea20cacaafULL,0xf3898e897df4f48eULL,0x8e20e920674747e9ULL,0x2028182838101018ULL,0xde64d5640b6f6fd5ULL,0xfb83888373f0f088ULL,0x94b16fb1fb4a4a6fULL,0xb8967296ca5c5c72ULL,0x706c246c54383824ULL,0xae08f1085f5757f1ULL,0xe652c752217373c7ULL,0x35f351f364979751ULL,0x8d652365aecbcb23ULL,0x59847c8425a1a17cULL,0xcbbf9cbf57e8e89cULL,0x7c6321635d3e3e21ULL,0x377cdd7cea9696ddULL,0xc27fdc7f1e6161dcULL,0x1a9186919c0d0d86ULL,0x1e9485949b0f0f85ULL,0xdbab90ab4be0e090ULL,0xf8c642c6ba7c7c42ULL,0xe257c457267171c4ULL,0x83e5aae529ccccaaULL,0x3b73d873e39090d8ULL,0x0c0f050f09060605ULL,0xf5030103f4f7f701ULL,0x383612362a1c1c12ULL,0x9ffea3fe3cc2c2a3ULL,0xd4e15fe18b6a6a5fULL,0x4710f910beaeaef9ULL,0xd26bd06b026969d0ULL,0x2ea891a8bf171791ULL,0x29e858e871999958ULL,0x74692769533a3a27ULL,0x4ed0b9d0f72727b9ULL,0xa948384891d9d938ULL,0xcd351335deebeb13ULL,0x56ceb3cee52b2bb3ULL,0x4455335577222233ULL,0xbfd6bbd604d2d2bbULL,0x4990709039a9a970ULL,0x0e80898087070789ULL,0x66f2a7f2c13333a7ULL,0x5ac1b6c1ec2d2db6ULL,0x786622665a3c3c22ULL,0x2aad92adb8151592ULL,0x89602060a9c9c920ULL,0x15db49db5c878749ULL,0x4f1aff1ab0aaaaffULL,0xa0887888d8505078ULL,0x518e7a8e2ba5a57aULL,0x068a8f8a8903038fULL,0xb213f8134a5959f8ULL,0x129b809b92090980ULL,0x34391739231a1a17ULL,0xca75da75106565daULL,0xb553315384d7d731ULL,0x1351c651d58484c6ULL,0xbbd3b8d303d0d0b8ULL,0x1f5ec35edc8282c3ULL,0x52cbb0cbe22929b0ULL,0xb4997799c35a5a77ULL,0x3c3311332d1e1e11ULL,0xf646cb463d7b7bcbULL,0x4b1ffc1fb7a8a8fcULL,0xda61d6610c6d6dd6ULL,0x584e3a4e622c2c3aULL}; +__attribute__ ((aligned (16))) const u64 grsoT3[256] = 
+{0xf4a5f432c6c6a597ULL,0x9784976ff8f884ebULL,0xb099b05eeeee99c7ULL,0x8c8d8c7af6f68df7ULL,0x170d17e8ffff0de5ULL,0xdcbddc0ad6d6bdb7ULL,0xc8b1c816dedeb1a7ULL,0xfc54fc6d91915439ULL,0xf050f090606050c0ULL,0x0503050702020304ULL,0xe0a9e02ececea987ULL,0x877d87d156567dacULL,0x2b192bcce7e719d5ULL,0xa662a613b5b56271ULL,0x31e6317c4d4de69aULL,0xb59ab559ecec9ac3ULL,0xcf45cf408f8f4505ULL,0xbc9dbca31f1f9d3eULL,0xc040c04989894009ULL,0x92879268fafa87efULL,0x3f153fd0efef15c5ULL,0x26eb2694b2b2eb7fULL,0x40c940ce8e8ec907ULL,0x1d0b1de6fbfb0bedULL,0x2fec2f6e4141ec82ULL,0xa967a91ab3b3677dULL,0x1cfd1c435f5ffdbeULL,0x25ea25604545ea8aULL,0xdabfdaf92323bf46ULL,0x02f702515353f7a6ULL,0xa196a145e4e496d3ULL,0xed5bed769b9b5b2dULL,0x5dc25d287575c2eaULL,0x241c24c5e1e11cd9ULL,0xe9aee9d43d3dae7aULL,0xbe6abef24c4c6a98ULL,0xee5aee826c6c5ad8ULL,0xc341c3bd7e7e41fcULL,0x060206f3f5f502f1ULL,0xd14fd15283834f1dULL,0xe45ce48c68685cd0ULL,0x07f407565151f4a2ULL,0x5c345c8dd1d134b9ULL,0x180818e1f9f908e9ULL,0xae93ae4ce2e293dfULL,0x9573953eabab734dULL,0xf553f597626253c4ULL,0x413f416b2a2a3f54ULL,0x140c141c08080c10ULL,0xf652f66395955231ULL,0xaf65afe94646658cULL,0xe25ee27f9d9d5e21ULL,0x7828784830302860ULL,0xf8a1f8cf3737a16eULL,0x110f111b0a0a0f14ULL,0xc4b5c4eb2f2fb55eULL,0x1b091b150e0e091cULL,0x5a365a7e24243648ULL,0xb69bb6ad1b1b9b36ULL,0x473d4798dfdf3da5ULL,0x6a266aa7cdcd2681ULL,0xbb69bbf54e4e699cULL,0x4ccd4c337f7fcdfeULL,0xba9fba50eaea9fcfULL,0x2d1b2d3f12121b24ULL,0xb99eb9a41d1d9e3aULL,0x9c749cc4585874b0ULL,0x722e724634342e68ULL,0x772d774136362d6cULL,0xcdb2cd11dcdcb2a3ULL,0x29ee299db4b4ee73ULL,0x16fb164d5b5bfbb6ULL,0x01f601a5a4a4f653ULL,0xd74dd7a176764decULL,0xa361a314b7b76175ULL,0x49ce49347d7dcefaULL,0x8d7b8ddf52527ba4ULL,0x423e429fdddd3ea1ULL,0x937193cd5e5e71bcULL,0xa297a2b113139726ULL,0x04f504a2a6a6f557ULL,0xb868b801b9b96869ULL,0x0000000000000000ULL,0x742c74b5c1c12c99ULL,0xa060a0e040406080ULL,0x211f21c2e3e31fddULL,0x43c8433a7979c8f2ULL,0x2ced2c9ab6b6ed77ULL,0xd9bed90dd4d4beb3ULL,0xca46ca478d8d4601ULL,0x70d970176767d9ce
ULL,0xdd4bddaf72724be4ULL,0x79de79ed9494de33ULL,0x67d467ff9898d42bULL,0x23e82393b0b0e87bULL,0xde4ade5b85854a11ULL,0xbd6bbd06bbbb6b6dULL,0x7e2a7ebbc5c52a91ULL,0x34e5347b4f4fe59eULL,0x3a163ad7eded16c1ULL,0x54c554d28686c517ULL,0x62d762f89a9ad72fULL,0xff55ff99666655ccULL,0xa794a7b611119422ULL,0x4acf4ac08a8acf0fULL,0x301030d9e9e910c9ULL,0x0a060a0e04040608ULL,0x98819866fefe81e7ULL,0x0bf00baba0a0f05bULL,0xcc44ccb4787844f0ULL,0xd5bad5f02525ba4aULL,0x3ee33e754b4be396ULL,0x0ef30eaca2a2f35fULL,0x19fe19445d5dfebaULL,0x5bc05bdb8080c01bULL,0x858a858005058a0aULL,0xecadecd33f3fad7eULL,0xdfbcdffe2121bc42ULL,0xd848d8a8707048e0ULL,0x0c040cfdf1f104f9ULL,0x7adf7a196363dfc6ULL,0x58c1582f7777c1eeULL,0x9f759f30afaf7545ULL,0xa563a5e742426384ULL,0x5030507020203040ULL,0x2e1a2ecbe5e51ad1ULL,0x120e12effdfd0ee1ULL,0xb76db708bfbf6d65ULL,0xd44cd45581814c19ULL,0x3c143c2418181430ULL,0x5f355f792626354cULL,0x712f71b2c3c32f9dULL,0x38e13886bebee167ULL,0xfda2fdc83535a26aULL,0x4fcc4fc78888cc0bULL,0x4b394b652e2e395cULL,0xf957f96a9393573dULL,0x0df20d585555f2aaULL,0x9d829d61fcfc82e3ULL,0xc947c9b37a7a47f4ULL,0xefacef27c8c8ac8bULL,0x32e73288babae76fULL,0x7d2b7d4f32322b64ULL,0xa495a442e6e695d7ULL,0xfba0fb3bc0c0a09bULL,0xb398b3aa19199832ULL,0x68d168f69e9ed127ULL,0x817f8122a3a37f5dULL,0xaa66aaee44446688ULL,0x827e82d654547ea8ULL,0xe6abe6dd3b3bab76ULL,0x9e839e950b0b8316ULL,0x45ca45c98c8cca03ULL,0x7b297bbcc7c72995ULL,0x6ed36e056b6bd3d6ULL,0x443c446c28283c50ULL,0x8b798b2ca7a77955ULL,0x3de23d81bcbce263ULL,0x271d273116161d2cULL,0x9a769a37adad7641ULL,0x4d3b4d96dbdb3badULL,0xfa56fa9e646456c8ULL,0xd24ed2a674744ee8ULL,0x221e223614141e28ULL,0x76db76e49292db3fULL,0x1e0a1e120c0c0a18ULL,0xb46cb4fc48486c90ULL,0x37e4378fb8b8e46bULL,0xe75de7789f9f5d25ULL,0xb26eb20fbdbd6e61ULL,0x2aef2a694343ef86ULL,0xf1a6f135c4c4a693ULL,0xe3a8e3da3939a872ULL,0xf7a4f7c63131a462ULL,0x5937598ad3d337bdULL,0x868b8674f2f28bffULL,0x56325683d5d532b1ULL,0xc543c54e8b8b430dULL,0xeb59eb856e6e59dcULL,0xc2b7c218dadab7afULL,0x8f8c8f8e01018c02ULL,0xac64ac1db1b164
79ULL,0x6dd26df19c9cd223ULL,0x3be03b724949e092ULL,0xc7b4c71fd8d8b4abULL,0x15fa15b9acacfa43ULL,0x090709faf3f307fdULL,0x6f256fa0cfcf2585ULL,0xeaafea20cacaaf8fULL,0x898e897df4f48ef3ULL,0x20e920674747e98eULL,0x2818283810101820ULL,0x64d5640b6f6fd5deULL,0x83888373f0f088fbULL,0xb16fb1fb4a4a6f94ULL,0x967296ca5c5c72b8ULL,0x6c246c5438382470ULL,0x08f1085f5757f1aeULL,0x52c752217373c7e6ULL,0xf351f36497975135ULL,0x652365aecbcb238dULL,0x847c8425a1a17c59ULL,0xbf9cbf57e8e89ccbULL,0x6321635d3e3e217cULL,0x7cdd7cea9696dd37ULL,0x7fdc7f1e6161dcc2ULL,0x9186919c0d0d861aULL,0x9485949b0f0f851eULL,0xab90ab4be0e090dbULL,0xc642c6ba7c7c42f8ULL,0x57c457267171c4e2ULL,0xe5aae529ccccaa83ULL,0x73d873e39090d83bULL,0x0f050f090606050cULL,0x030103f4f7f701f5ULL,0x3612362a1c1c1238ULL,0xfea3fe3cc2c2a39fULL,0xe15fe18b6a6a5fd4ULL,0x10f910beaeaef947ULL,0x6bd06b026969d0d2ULL,0xa891a8bf1717912eULL,0xe858e87199995829ULL,0x692769533a3a2774ULL,0xd0b9d0f72727b94eULL,0x48384891d9d938a9ULL,0x351335deebeb13cdULL,0xceb3cee52b2bb356ULL,0x5533557722223344ULL,0xd6bbd604d2d2bbbfULL,0x90709039a9a97049ULL,0x808980870707890eULL,0xf2a7f2c13333a766ULL,0xc1b6c1ec2d2db65aULL,0x6622665a3c3c2278ULL,0xad92adb81515922aULL,0x602060a9c9c92089ULL,0xdb49db5c87874915ULL,0x1aff1ab0aaaaff4fULL,0x887888d8505078a0ULL,0x8e7a8e2ba5a57a51ULL,0x8a8f8a8903038f06ULL,0x13f8134a5959f8b2ULL,0x9b809b9209098012ULL,0x391739231a1a1734ULL,0x75da75106565dacaULL,0x53315384d7d731b5ULL,0x51c651d58484c613ULL,0xd3b8d303d0d0b8bbULL,0x5ec35edc8282c31fULL,0xcbb0cbe22929b052ULL,0x997799c35a5a77b4ULL,0x3311332d1e1e113cULL,0x46cb463d7b7bcbf6ULL,0x1ffc1fb7a8a8fc4bULL,0x61d6610c6d6dd6daULL,0x4e3a4e622c2c3a58ULL}; +__attribute__ ((aligned (16))) const u64 grsoT4[256] = 
+{0xa5f432c6c6a597f4ULL,0x84976ff8f884eb97ULL,0x99b05eeeee99c7b0ULL,0x8d8c7af6f68df78cULL,0x0d17e8ffff0de517ULL,0xbddc0ad6d6bdb7dcULL,0xb1c816dedeb1a7c8ULL,0x54fc6d91915439fcULL,0x50f090606050c0f0ULL,0x0305070202030405ULL,0xa9e02ececea987e0ULL,0x7d87d156567dac87ULL,0x192bcce7e719d52bULL,0x62a613b5b56271a6ULL,0xe6317c4d4de69a31ULL,0x9ab559ecec9ac3b5ULL,0x45cf408f8f4505cfULL,0x9dbca31f1f9d3ebcULL,0x40c04989894009c0ULL,0x879268fafa87ef92ULL,0x153fd0efef15c53fULL,0xeb2694b2b2eb7f26ULL,0xc940ce8e8ec90740ULL,0x0b1de6fbfb0bed1dULL,0xec2f6e4141ec822fULL,0x67a91ab3b3677da9ULL,0xfd1c435f5ffdbe1cULL,0xea25604545ea8a25ULL,0xbfdaf92323bf46daULL,0xf702515353f7a602ULL,0x96a145e4e496d3a1ULL,0x5bed769b9b5b2dedULL,0xc25d287575c2ea5dULL,0x1c24c5e1e11cd924ULL,0xaee9d43d3dae7ae9ULL,0x6abef24c4c6a98beULL,0x5aee826c6c5ad8eeULL,0x41c3bd7e7e41fcc3ULL,0x0206f3f5f502f106ULL,0x4fd15283834f1dd1ULL,0x5ce48c68685cd0e4ULL,0xf407565151f4a207ULL,0x345c8dd1d134b95cULL,0x0818e1f9f908e918ULL,0x93ae4ce2e293dfaeULL,0x73953eabab734d95ULL,0x53f597626253c4f5ULL,0x3f416b2a2a3f5441ULL,0x0c141c08080c1014ULL,0x52f66395955231f6ULL,0x65afe94646658cafULL,0x5ee27f9d9d5e21e2ULL,0x2878483030286078ULL,0xa1f8cf3737a16ef8ULL,0x0f111b0a0a0f1411ULL,0xb5c4eb2f2fb55ec4ULL,0x091b150e0e091c1bULL,0x365a7e242436485aULL,0x9bb6ad1b1b9b36b6ULL,0x3d4798dfdf3da547ULL,0x266aa7cdcd26816aULL,0x69bbf54e4e699cbbULL,0xcd4c337f7fcdfe4cULL,0x9fba50eaea9fcfbaULL,0x1b2d3f12121b242dULL,0x9eb9a41d1d9e3ab9ULL,0x749cc4585874b09cULL,0x2e724634342e6872ULL,0x2d774136362d6c77ULL,0xb2cd11dcdcb2a3cdULL,0xee299db4b4ee7329ULL,0xfb164d5b5bfbb616ULL,0xf601a5a4a4f65301ULL,0x4dd7a176764decd7ULL,0x61a314b7b76175a3ULL,0xce49347d7dcefa49ULL,0x7b8ddf52527ba48dULL,0x3e429fdddd3ea142ULL,0x7193cd5e5e71bc93ULL,0x97a2b113139726a2ULL,0xf504a2a6a6f55704ULL,0x68b801b9b96869b8ULL,0x0000000000000000ULL,0x2c74b5c1c12c9974ULL,0x60a0e040406080a0ULL,0x1f21c2e3e31fdd21ULL,0xc8433a7979c8f243ULL,0xed2c9ab6b6ed772cULL,0xbed90dd4d4beb3d9ULL,0x46ca478d8d4601caULL,0xd970176767d9ce70
ULL,0x4bddaf72724be4ddULL,0xde79ed9494de3379ULL,0xd467ff9898d42b67ULL,0xe82393b0b0e87b23ULL,0x4ade5b85854a11deULL,0x6bbd06bbbb6b6dbdULL,0x2a7ebbc5c52a917eULL,0xe5347b4f4fe59e34ULL,0x163ad7eded16c13aULL,0xc554d28686c51754ULL,0xd762f89a9ad72f62ULL,0x55ff99666655ccffULL,0x94a7b611119422a7ULL,0xcf4ac08a8acf0f4aULL,0x1030d9e9e910c930ULL,0x060a0e040406080aULL,0x819866fefe81e798ULL,0xf00baba0a0f05b0bULL,0x44ccb4787844f0ccULL,0xbad5f02525ba4ad5ULL,0xe33e754b4be3963eULL,0xf30eaca2a2f35f0eULL,0xfe19445d5dfeba19ULL,0xc05bdb8080c01b5bULL,0x8a858005058a0a85ULL,0xadecd33f3fad7eecULL,0xbcdffe2121bc42dfULL,0x48d8a8707048e0d8ULL,0x040cfdf1f104f90cULL,0xdf7a196363dfc67aULL,0xc1582f7777c1ee58ULL,0x759f30afaf75459fULL,0x63a5e742426384a5ULL,0x3050702020304050ULL,0x1a2ecbe5e51ad12eULL,0x0e12effdfd0ee112ULL,0x6db708bfbf6d65b7ULL,0x4cd45581814c19d4ULL,0x143c24181814303cULL,0x355f792626354c5fULL,0x2f71b2c3c32f9d71ULL,0xe13886bebee16738ULL,0xa2fdc83535a26afdULL,0xcc4fc78888cc0b4fULL,0x394b652e2e395c4bULL,0x57f96a9393573df9ULL,0xf20d585555f2aa0dULL,0x829d61fcfc82e39dULL,0x47c9b37a7a47f4c9ULL,0xacef27c8c8ac8befULL,0xe73288babae76f32ULL,0x2b7d4f32322b647dULL,0x95a442e6e695d7a4ULL,0xa0fb3bc0c0a09bfbULL,0x98b3aa19199832b3ULL,0xd168f69e9ed12768ULL,0x7f8122a3a37f5d81ULL,0x66aaee44446688aaULL,0x7e82d654547ea882ULL,0xabe6dd3b3bab76e6ULL,0x839e950b0b83169eULL,0xca45c98c8cca0345ULL,0x297bbcc7c729957bULL,0xd36e056b6bd3d66eULL,0x3c446c28283c5044ULL,0x798b2ca7a779558bULL,0xe23d81bcbce2633dULL,0x1d273116161d2c27ULL,0x769a37adad76419aULL,0x3b4d96dbdb3bad4dULL,0x56fa9e646456c8faULL,0x4ed2a674744ee8d2ULL,0x1e223614141e2822ULL,0xdb76e49292db3f76ULL,0x0a1e120c0c0a181eULL,0x6cb4fc48486c90b4ULL,0xe4378fb8b8e46b37ULL,0x5de7789f9f5d25e7ULL,0x6eb20fbdbd6e61b2ULL,0xef2a694343ef862aULL,0xa6f135c4c4a693f1ULL,0xa8e3da3939a872e3ULL,0xa4f7c63131a462f7ULL,0x37598ad3d337bd59ULL,0x8b8674f2f28bff86ULL,0x325683d5d532b156ULL,0x43c54e8b8b430dc5ULL,0x59eb856e6e59dcebULL,0xb7c218dadab7afc2ULL,0x8c8f8e01018c028fULL,0x64ac1db1b16479
acULL,0xd26df19c9cd2236dULL,0xe03b724949e0923bULL,0xb4c71fd8d8b4abc7ULL,0xfa15b9acacfa4315ULL,0x0709faf3f307fd09ULL,0x256fa0cfcf25856fULL,0xafea20cacaaf8feaULL,0x8e897df4f48ef389ULL,0xe920674747e98e20ULL,0x1828381010182028ULL,0xd5640b6f6fd5de64ULL,0x888373f0f088fb83ULL,0x6fb1fb4a4a6f94b1ULL,0x7296ca5c5c72b896ULL,0x246c54383824706cULL,0xf1085f5757f1ae08ULL,0xc752217373c7e652ULL,0x51f36497975135f3ULL,0x2365aecbcb238d65ULL,0x7c8425a1a17c5984ULL,0x9cbf57e8e89ccbbfULL,0x21635d3e3e217c63ULL,0xdd7cea9696dd377cULL,0xdc7f1e6161dcc27fULL,0x86919c0d0d861a91ULL,0x85949b0f0f851e94ULL,0x90ab4be0e090dbabULL,0x42c6ba7c7c42f8c6ULL,0xc457267171c4e257ULL,0xaae529ccccaa83e5ULL,0xd873e39090d83b73ULL,0x050f090606050c0fULL,0x0103f4f7f701f503ULL,0x12362a1c1c123836ULL,0xa3fe3cc2c2a39ffeULL,0x5fe18b6a6a5fd4e1ULL,0xf910beaeaef94710ULL,0xd06b026969d0d26bULL,0x91a8bf1717912ea8ULL,0x58e87199995829e8ULL,0x2769533a3a277469ULL,0xb9d0f72727b94ed0ULL,0x384891d9d938a948ULL,0x1335deebeb13cd35ULL,0xb3cee52b2bb356ceULL,0x3355772222334455ULL,0xbbd604d2d2bbbfd6ULL,0x709039a9a9704990ULL,0x8980870707890e80ULL,0xa7f2c13333a766f2ULL,0xb6c1ec2d2db65ac1ULL,0x22665a3c3c227866ULL,0x92adb81515922aadULL,0x2060a9c9c9208960ULL,0x49db5c87874915dbULL,0xff1ab0aaaaff4f1aULL,0x7888d8505078a088ULL,0x7a8e2ba5a57a518eULL,0x8f8a8903038f068aULL,0xf8134a5959f8b213ULL,0x809b92090980129bULL,0x1739231a1a173439ULL,0xda75106565daca75ULL,0x315384d7d731b553ULL,0xc651d58484c61351ULL,0xb8d303d0d0b8bbd3ULL,0xc35edc8282c31f5eULL,0xb0cbe22929b052cbULL,0x7799c35a5a77b499ULL,0x11332d1e1e113c33ULL,0xcb463d7b7bcbf646ULL,0xfc1fb7a8a8fc4b1fULL,0xd6610c6d6dd6da61ULL,0x3a4e622c2c3a584eULL}; +__attribute__ ((aligned (16))) const u64 grsoT5[256] = 
+{0xf432c6c6a597f4a5ULL,0x976ff8f884eb9784ULL,0xb05eeeee99c7b099ULL,0x8c7af6f68df78c8dULL,0x17e8ffff0de5170dULL,0xdc0ad6d6bdb7dcbdULL,0xc816dedeb1a7c8b1ULL,0xfc6d91915439fc54ULL,0xf090606050c0f050ULL,0x0507020203040503ULL,0xe02ececea987e0a9ULL,0x87d156567dac877dULL,0x2bcce7e719d52b19ULL,0xa613b5b56271a662ULL,0x317c4d4de69a31e6ULL,0xb559ecec9ac3b59aULL,0xcf408f8f4505cf45ULL,0xbca31f1f9d3ebc9dULL,0xc04989894009c040ULL,0x9268fafa87ef9287ULL,0x3fd0efef15c53f15ULL,0x2694b2b2eb7f26ebULL,0x40ce8e8ec90740c9ULL,0x1de6fbfb0bed1d0bULL,0x2f6e4141ec822fecULL,0xa91ab3b3677da967ULL,0x1c435f5ffdbe1cfdULL,0x25604545ea8a25eaULL,0xdaf92323bf46dabfULL,0x02515353f7a602f7ULL,0xa145e4e496d3a196ULL,0xed769b9b5b2ded5bULL,0x5d287575c2ea5dc2ULL,0x24c5e1e11cd9241cULL,0xe9d43d3dae7ae9aeULL,0xbef24c4c6a98be6aULL,0xee826c6c5ad8ee5aULL,0xc3bd7e7e41fcc341ULL,0x06f3f5f502f10602ULL,0xd15283834f1dd14fULL,0xe48c68685cd0e45cULL,0x07565151f4a207f4ULL,0x5c8dd1d134b95c34ULL,0x18e1f9f908e91808ULL,0xae4ce2e293dfae93ULL,0x953eabab734d9573ULL,0xf597626253c4f553ULL,0x416b2a2a3f54413fULL,0x141c08080c10140cULL,0xf66395955231f652ULL,0xafe94646658caf65ULL,0xe27f9d9d5e21e25eULL,0x7848303028607828ULL,0xf8cf3737a16ef8a1ULL,0x111b0a0a0f14110fULL,0xc4eb2f2fb55ec4b5ULL,0x1b150e0e091c1b09ULL,0x5a7e242436485a36ULL,0xb6ad1b1b9b36b69bULL,0x4798dfdf3da5473dULL,0x6aa7cdcd26816a26ULL,0xbbf54e4e699cbb69ULL,0x4c337f7fcdfe4ccdULL,0xba50eaea9fcfba9fULL,0x2d3f12121b242d1bULL,0xb9a41d1d9e3ab99eULL,0x9cc4585874b09c74ULL,0x724634342e68722eULL,0x774136362d6c772dULL,0xcd11dcdcb2a3cdb2ULL,0x299db4b4ee7329eeULL,0x164d5b5bfbb616fbULL,0x01a5a4a4f65301f6ULL,0xd7a176764decd74dULL,0xa314b7b76175a361ULL,0x49347d7dcefa49ceULL,0x8ddf52527ba48d7bULL,0x429fdddd3ea1423eULL,0x93cd5e5e71bc9371ULL,0xa2b113139726a297ULL,0x04a2a6a6f55704f5ULL,0xb801b9b96869b868ULL,0x0000000000000000ULL,0x74b5c1c12c99742cULL,0xa0e040406080a060ULL,0x21c2e3e31fdd211fULL,0x433a7979c8f243c8ULL,0x2c9ab6b6ed772cedULL,0xd90dd4d4beb3d9beULL,0xca478d8d4601ca46ULL,0x70176767d9ce70d9
ULL,0xddaf72724be4dd4bULL,0x79ed9494de3379deULL,0x67ff9898d42b67d4ULL,0x2393b0b0e87b23e8ULL,0xde5b85854a11de4aULL,0xbd06bbbb6b6dbd6bULL,0x7ebbc5c52a917e2aULL,0x347b4f4fe59e34e5ULL,0x3ad7eded16c13a16ULL,0x54d28686c51754c5ULL,0x62f89a9ad72f62d7ULL,0xff99666655ccff55ULL,0xa7b611119422a794ULL,0x4ac08a8acf0f4acfULL,0x30d9e9e910c93010ULL,0x0a0e040406080a06ULL,0x9866fefe81e79881ULL,0x0baba0a0f05b0bf0ULL,0xccb4787844f0cc44ULL,0xd5f02525ba4ad5baULL,0x3e754b4be3963ee3ULL,0x0eaca2a2f35f0ef3ULL,0x19445d5dfeba19feULL,0x5bdb8080c01b5bc0ULL,0x858005058a0a858aULL,0xecd33f3fad7eecadULL,0xdffe2121bc42dfbcULL,0xd8a8707048e0d848ULL,0x0cfdf1f104f90c04ULL,0x7a196363dfc67adfULL,0x582f7777c1ee58c1ULL,0x9f30afaf75459f75ULL,0xa5e742426384a563ULL,0x5070202030405030ULL,0x2ecbe5e51ad12e1aULL,0x12effdfd0ee1120eULL,0xb708bfbf6d65b76dULL,0xd45581814c19d44cULL,0x3c24181814303c14ULL,0x5f792626354c5f35ULL,0x71b2c3c32f9d712fULL,0x3886bebee16738e1ULL,0xfdc83535a26afda2ULL,0x4fc78888cc0b4fccULL,0x4b652e2e395c4b39ULL,0xf96a9393573df957ULL,0x0d585555f2aa0df2ULL,0x9d61fcfc82e39d82ULL,0xc9b37a7a47f4c947ULL,0xef27c8c8ac8befacULL,0x3288babae76f32e7ULL,0x7d4f32322b647d2bULL,0xa442e6e695d7a495ULL,0xfb3bc0c0a09bfba0ULL,0xb3aa19199832b398ULL,0x68f69e9ed12768d1ULL,0x8122a3a37f5d817fULL,0xaaee44446688aa66ULL,0x82d654547ea8827eULL,0xe6dd3b3bab76e6abULL,0x9e950b0b83169e83ULL,0x45c98c8cca0345caULL,0x7bbcc7c729957b29ULL,0x6e056b6bd3d66ed3ULL,0x446c28283c50443cULL,0x8b2ca7a779558b79ULL,0x3d81bcbce2633de2ULL,0x273116161d2c271dULL,0x9a37adad76419a76ULL,0x4d96dbdb3bad4d3bULL,0xfa9e646456c8fa56ULL,0xd2a674744ee8d24eULL,0x223614141e28221eULL,0x76e49292db3f76dbULL,0x1e120c0c0a181e0aULL,0xb4fc48486c90b46cULL,0x378fb8b8e46b37e4ULL,0xe7789f9f5d25e75dULL,0xb20fbdbd6e61b26eULL,0x2a694343ef862aefULL,0xf135c4c4a693f1a6ULL,0xe3da3939a872e3a8ULL,0xf7c63131a462f7a4ULL,0x598ad3d337bd5937ULL,0x8674f2f28bff868bULL,0x5683d5d532b15632ULL,0xc54e8b8b430dc543ULL,0xeb856e6e59dceb59ULL,0xc218dadab7afc2b7ULL,0x8f8e01018c028f8cULL,0xac1db1b16479ac
64ULL,0x6df19c9cd2236dd2ULL,0x3b724949e0923be0ULL,0xc71fd8d8b4abc7b4ULL,0x15b9acacfa4315faULL,0x09faf3f307fd0907ULL,0x6fa0cfcf25856f25ULL,0xea20cacaaf8feaafULL,0x897df4f48ef3898eULL,0x20674747e98e20e9ULL,0x2838101018202818ULL,0x640b6f6fd5de64d5ULL,0x8373f0f088fb8388ULL,0xb1fb4a4a6f94b16fULL,0x96ca5c5c72b89672ULL,0x6c54383824706c24ULL,0x085f5757f1ae08f1ULL,0x52217373c7e652c7ULL,0xf36497975135f351ULL,0x65aecbcb238d6523ULL,0x8425a1a17c59847cULL,0xbf57e8e89ccbbf9cULL,0x635d3e3e217c6321ULL,0x7cea9696dd377cddULL,0x7f1e6161dcc27fdcULL,0x919c0d0d861a9186ULL,0x949b0f0f851e9485ULL,0xab4be0e090dbab90ULL,0xc6ba7c7c42f8c642ULL,0x57267171c4e257c4ULL,0xe529ccccaa83e5aaULL,0x73e39090d83b73d8ULL,0x0f090606050c0f05ULL,0x03f4f7f701f50301ULL,0x362a1c1c12383612ULL,0xfe3cc2c2a39ffea3ULL,0xe18b6a6a5fd4e15fULL,0x10beaeaef94710f9ULL,0x6b026969d0d26bd0ULL,0xa8bf1717912ea891ULL,0xe87199995829e858ULL,0x69533a3a27746927ULL,0xd0f72727b94ed0b9ULL,0x4891d9d938a94838ULL,0x35deebeb13cd3513ULL,0xcee52b2bb356ceb3ULL,0x5577222233445533ULL,0xd604d2d2bbbfd6bbULL,0x9039a9a970499070ULL,0x80870707890e8089ULL,0xf2c13333a766f2a7ULL,0xc1ec2d2db65ac1b6ULL,0x665a3c3c22786622ULL,0xadb81515922aad92ULL,0x60a9c9c920896020ULL,0xdb5c87874915db49ULL,0x1ab0aaaaff4f1affULL,0x88d8505078a08878ULL,0x8e2ba5a57a518e7aULL,0x8a8903038f068a8fULL,0x134a5959f8b213f8ULL,0x9b92090980129b80ULL,0x39231a1a17343917ULL,0x75106565daca75daULL,0x5384d7d731b55331ULL,0x51d58484c61351c6ULL,0xd303d0d0b8bbd3b8ULL,0x5edc8282c31f5ec3ULL,0xcbe22929b052cbb0ULL,0x99c35a5a77b49977ULL,0x332d1e1e113c3311ULL,0x463d7b7bcbf646cbULL,0x1fb7a8a8fc4b1ffcULL,0x610c6d6dd6da61d6ULL,0x4e622c2c3a584e3aULL}; +__attribute__ ((aligned (16))) const u64 grsoT6[256] = 
+{0x32c6c6a597f4a5f4ULL,0x6ff8f884eb978497ULL,0x5eeeee99c7b099b0ULL,0x7af6f68df78c8d8cULL,0xe8ffff0de5170d17ULL,0x0ad6d6bdb7dcbddcULL,0x16dedeb1a7c8b1c8ULL,0x6d91915439fc54fcULL,0x90606050c0f050f0ULL,0x0702020304050305ULL,0x2ececea987e0a9e0ULL,0xd156567dac877d87ULL,0xcce7e719d52b192bULL,0x13b5b56271a662a6ULL,0x7c4d4de69a31e631ULL,0x59ecec9ac3b59ab5ULL,0x408f8f4505cf45cfULL,0xa31f1f9d3ebc9dbcULL,0x4989894009c040c0ULL,0x68fafa87ef928792ULL,0xd0efef15c53f153fULL,0x94b2b2eb7f26eb26ULL,0xce8e8ec90740c940ULL,0xe6fbfb0bed1d0b1dULL,0x6e4141ec822fec2fULL,0x1ab3b3677da967a9ULL,0x435f5ffdbe1cfd1cULL,0x604545ea8a25ea25ULL,0xf92323bf46dabfdaULL,0x515353f7a602f702ULL,0x45e4e496d3a196a1ULL,0x769b9b5b2ded5bedULL,0x287575c2ea5dc25dULL,0xc5e1e11cd9241c24ULL,0xd43d3dae7ae9aee9ULL,0xf24c4c6a98be6abeULL,0x826c6c5ad8ee5aeeULL,0xbd7e7e41fcc341c3ULL,0xf3f5f502f1060206ULL,0x5283834f1dd14fd1ULL,0x8c68685cd0e45ce4ULL,0x565151f4a207f407ULL,0x8dd1d134b95c345cULL,0xe1f9f908e9180818ULL,0x4ce2e293dfae93aeULL,0x3eabab734d957395ULL,0x97626253c4f553f5ULL,0x6b2a2a3f54413f41ULL,0x1c08080c10140c14ULL,0x6395955231f652f6ULL,0xe94646658caf65afULL,0x7f9d9d5e21e25ee2ULL,0x4830302860782878ULL,0xcf3737a16ef8a1f8ULL,0x1b0a0a0f14110f11ULL,0xeb2f2fb55ec4b5c4ULL,0x150e0e091c1b091bULL,0x7e242436485a365aULL,0xad1b1b9b36b69bb6ULL,0x98dfdf3da5473d47ULL,0xa7cdcd26816a266aULL,0xf54e4e699cbb69bbULL,0x337f7fcdfe4ccd4cULL,0x50eaea9fcfba9fbaULL,0x3f12121b242d1b2dULL,0xa41d1d9e3ab99eb9ULL,0xc4585874b09c749cULL,0x4634342e68722e72ULL,0x4136362d6c772d77ULL,0x11dcdcb2a3cdb2cdULL,0x9db4b4ee7329ee29ULL,0x4d5b5bfbb616fb16ULL,0xa5a4a4f65301f601ULL,0xa176764decd74dd7ULL,0x14b7b76175a361a3ULL,0x347d7dcefa49ce49ULL,0xdf52527ba48d7b8dULL,0x9fdddd3ea1423e42ULL,0xcd5e5e71bc937193ULL,0xb113139726a297a2ULL,0xa2a6a6f55704f504ULL,0x01b9b96869b868b8ULL,0x0000000000000000ULL,0xb5c1c12c99742c74ULL,0xe040406080a060a0ULL,0xc2e3e31fdd211f21ULL,0x3a7979c8f243c843ULL,0x9ab6b6ed772ced2cULL,0x0dd4d4beb3d9bed9ULL,0x478d8d4601ca46caULL,0x176767d9ce70d970
ULL,0xaf72724be4dd4bddULL,0xed9494de3379de79ULL,0xff9898d42b67d467ULL,0x93b0b0e87b23e823ULL,0x5b85854a11de4adeULL,0x06bbbb6b6dbd6bbdULL,0xbbc5c52a917e2a7eULL,0x7b4f4fe59e34e534ULL,0xd7eded16c13a163aULL,0xd28686c51754c554ULL,0xf89a9ad72f62d762ULL,0x99666655ccff55ffULL,0xb611119422a794a7ULL,0xc08a8acf0f4acf4aULL,0xd9e9e910c9301030ULL,0x0e040406080a060aULL,0x66fefe81e7988198ULL,0xaba0a0f05b0bf00bULL,0xb4787844f0cc44ccULL,0xf02525ba4ad5bad5ULL,0x754b4be3963ee33eULL,0xaca2a2f35f0ef30eULL,0x445d5dfeba19fe19ULL,0xdb8080c01b5bc05bULL,0x8005058a0a858a85ULL,0xd33f3fad7eecadecULL,0xfe2121bc42dfbcdfULL,0xa8707048e0d848d8ULL,0xfdf1f104f90c040cULL,0x196363dfc67adf7aULL,0x2f7777c1ee58c158ULL,0x30afaf75459f759fULL,0xe742426384a563a5ULL,0x7020203040503050ULL,0xcbe5e51ad12e1a2eULL,0xeffdfd0ee1120e12ULL,0x08bfbf6d65b76db7ULL,0x5581814c19d44cd4ULL,0x24181814303c143cULL,0x792626354c5f355fULL,0xb2c3c32f9d712f71ULL,0x86bebee16738e138ULL,0xc83535a26afda2fdULL,0xc78888cc0b4fcc4fULL,0x652e2e395c4b394bULL,0x6a9393573df957f9ULL,0x585555f2aa0df20dULL,0x61fcfc82e39d829dULL,0xb37a7a47f4c947c9ULL,0x27c8c8ac8befacefULL,0x88babae76f32e732ULL,0x4f32322b647d2b7dULL,0x42e6e695d7a495a4ULL,0x3bc0c0a09bfba0fbULL,0xaa19199832b398b3ULL,0xf69e9ed12768d168ULL,0x22a3a37f5d817f81ULL,0xee44446688aa66aaULL,0xd654547ea8827e82ULL,0xdd3b3bab76e6abe6ULL,0x950b0b83169e839eULL,0xc98c8cca0345ca45ULL,0xbcc7c729957b297bULL,0x056b6bd3d66ed36eULL,0x6c28283c50443c44ULL,0x2ca7a779558b798bULL,0x81bcbce2633de23dULL,0x3116161d2c271d27ULL,0x37adad76419a769aULL,0x96dbdb3bad4d3b4dULL,0x9e646456c8fa56faULL,0xa674744ee8d24ed2ULL,0x3614141e28221e22ULL,0xe49292db3f76db76ULL,0x120c0c0a181e0a1eULL,0xfc48486c90b46cb4ULL,0x8fb8b8e46b37e437ULL,0x789f9f5d25e75de7ULL,0x0fbdbd6e61b26eb2ULL,0x694343ef862aef2aULL,0x35c4c4a693f1a6f1ULL,0xda3939a872e3a8e3ULL,0xc63131a462f7a4f7ULL,0x8ad3d337bd593759ULL,0x74f2f28bff868b86ULL,0x83d5d532b1563256ULL,0x4e8b8b430dc543c5ULL,0x856e6e59dceb59ebULL,0x18dadab7afc2b7c2ULL,0x8e01018c028f8c8fULL,0x1db1b16479ac64
acULL,0xf19c9cd2236dd26dULL,0x724949e0923be03bULL,0x1fd8d8b4abc7b4c7ULL,0xb9acacfa4315fa15ULL,0xfaf3f307fd090709ULL,0xa0cfcf25856f256fULL,0x20cacaaf8feaafeaULL,0x7df4f48ef3898e89ULL,0x674747e98e20e920ULL,0x3810101820281828ULL,0x0b6f6fd5de64d564ULL,0x73f0f088fb838883ULL,0xfb4a4a6f94b16fb1ULL,0xca5c5c72b8967296ULL,0x54383824706c246cULL,0x5f5757f1ae08f108ULL,0x217373c7e652c752ULL,0x6497975135f351f3ULL,0xaecbcb238d652365ULL,0x25a1a17c59847c84ULL,0x57e8e89ccbbf9cbfULL,0x5d3e3e217c632163ULL,0xea9696dd377cdd7cULL,0x1e6161dcc27fdc7fULL,0x9c0d0d861a918691ULL,0x9b0f0f851e948594ULL,0x4be0e090dbab90abULL,0xba7c7c42f8c642c6ULL,0x267171c4e257c457ULL,0x29ccccaa83e5aae5ULL,0xe39090d83b73d873ULL,0x090606050c0f050fULL,0xf4f7f701f5030103ULL,0x2a1c1c1238361236ULL,0x3cc2c2a39ffea3feULL,0x8b6a6a5fd4e15fe1ULL,0xbeaeaef94710f910ULL,0x026969d0d26bd06bULL,0xbf1717912ea891a8ULL,0x7199995829e858e8ULL,0x533a3a2774692769ULL,0xf72727b94ed0b9d0ULL,0x91d9d938a9483848ULL,0xdeebeb13cd351335ULL,0xe52b2bb356ceb3ceULL,0x7722223344553355ULL,0x04d2d2bbbfd6bbd6ULL,0x39a9a97049907090ULL,0x870707890e808980ULL,0xc13333a766f2a7f2ULL,0xec2d2db65ac1b6c1ULL,0x5a3c3c2278662266ULL,0xb81515922aad92adULL,0xa9c9c92089602060ULL,0x5c87874915db49dbULL,0xb0aaaaff4f1aff1aULL,0xd8505078a0887888ULL,0x2ba5a57a518e7a8eULL,0x8903038f068a8f8aULL,0x4a5959f8b213f813ULL,0x92090980129b809bULL,0x231a1a1734391739ULL,0x106565daca75da75ULL,0x84d7d731b5533153ULL,0xd58484c61351c651ULL,0x03d0d0b8bbd3b8d3ULL,0xdc8282c31f5ec35eULL,0xe22929b052cbb0cbULL,0xc35a5a77b4997799ULL,0x2d1e1e113c331133ULL,0x3d7b7bcbf646cb46ULL,0xb7a8a8fc4b1ffc1fULL,0x0c6d6dd6da61d661ULL,0x622c2c3a584e3a4eULL}; +__attribute__ ((aligned (16))) const u64 grsoT7[256] = 
+{0xc6c6a597f4a5f432ULL,0xf8f884eb9784976fULL,0xeeee99c7b099b05eULL,0xf6f68df78c8d8c7aULL,0xffff0de5170d17e8ULL,0xd6d6bdb7dcbddc0aULL,0xdedeb1a7c8b1c816ULL,0x91915439fc54fc6dULL,0x606050c0f050f090ULL,0x0202030405030507ULL,0xcecea987e0a9e02eULL,0x56567dac877d87d1ULL,0xe7e719d52b192bccULL,0xb5b56271a662a613ULL,0x4d4de69a31e6317cULL,0xecec9ac3b59ab559ULL,0x8f8f4505cf45cf40ULL,0x1f1f9d3ebc9dbca3ULL,0x89894009c040c049ULL,0xfafa87ef92879268ULL,0xefef15c53f153fd0ULL,0xb2b2eb7f26eb2694ULL,0x8e8ec90740c940ceULL,0xfbfb0bed1d0b1de6ULL,0x4141ec822fec2f6eULL,0xb3b3677da967a91aULL,0x5f5ffdbe1cfd1c43ULL,0x4545ea8a25ea2560ULL,0x2323bf46dabfdaf9ULL,0x5353f7a602f70251ULL,0xe4e496d3a196a145ULL,0x9b9b5b2ded5bed76ULL,0x7575c2ea5dc25d28ULL,0xe1e11cd9241c24c5ULL,0x3d3dae7ae9aee9d4ULL,0x4c4c6a98be6abef2ULL,0x6c6c5ad8ee5aee82ULL,0x7e7e41fcc341c3bdULL,0xf5f502f1060206f3ULL,0x83834f1dd14fd152ULL,0x68685cd0e45ce48cULL,0x5151f4a207f40756ULL,0xd1d134b95c345c8dULL,0xf9f908e9180818e1ULL,0xe2e293dfae93ae4cULL,0xabab734d9573953eULL,0x626253c4f553f597ULL,0x2a2a3f54413f416bULL,0x08080c10140c141cULL,0x95955231f652f663ULL,0x4646658caf65afe9ULL,0x9d9d5e21e25ee27fULL,0x3030286078287848ULL,0x3737a16ef8a1f8cfULL,0x0a0a0f14110f111bULL,0x2f2fb55ec4b5c4ebULL,0x0e0e091c1b091b15ULL,0x242436485a365a7eULL,0x1b1b9b36b69bb6adULL,0xdfdf3da5473d4798ULL,0xcdcd26816a266aa7ULL,0x4e4e699cbb69bbf5ULL,0x7f7fcdfe4ccd4c33ULL,0xeaea9fcfba9fba50ULL,0x12121b242d1b2d3fULL,0x1d1d9e3ab99eb9a4ULL,0x585874b09c749cc4ULL,0x34342e68722e7246ULL,0x36362d6c772d7741ULL,0xdcdcb2a3cdb2cd11ULL,0xb4b4ee7329ee299dULL,0x5b5bfbb616fb164dULL,0xa4a4f65301f601a5ULL,0x76764decd74dd7a1ULL,0xb7b76175a361a314ULL,0x7d7dcefa49ce4934ULL,0x52527ba48d7b8ddfULL,0xdddd3ea1423e429fULL,0x5e5e71bc937193cdULL,0x13139726a297a2b1ULL,0xa6a6f55704f504a2ULL,0xb9b96869b868b801ULL,0x0000000000000000ULL,0xc1c12c99742c74b5ULL,0x40406080a060a0e0ULL,0xe3e31fdd211f21c2ULL,0x7979c8f243c8433aULL,0xb6b6ed772ced2c9aULL,0xd4d4beb3d9bed90dULL,0x8d8d4601ca46ca47ULL,0x6767d9ce70d97017
ULL,0x72724be4dd4bddafULL,0x9494de3379de79edULL,0x9898d42b67d467ffULL,0xb0b0e87b23e82393ULL,0x85854a11de4ade5bULL,0xbbbb6b6dbd6bbd06ULL,0xc5c52a917e2a7ebbULL,0x4f4fe59e34e5347bULL,0xeded16c13a163ad7ULL,0x8686c51754c554d2ULL,0x9a9ad72f62d762f8ULL,0x666655ccff55ff99ULL,0x11119422a794a7b6ULL,0x8a8acf0f4acf4ac0ULL,0xe9e910c9301030d9ULL,0x040406080a060a0eULL,0xfefe81e798819866ULL,0xa0a0f05b0bf00babULL,0x787844f0cc44ccb4ULL,0x2525ba4ad5bad5f0ULL,0x4b4be3963ee33e75ULL,0xa2a2f35f0ef30eacULL,0x5d5dfeba19fe1944ULL,0x8080c01b5bc05bdbULL,0x05058a0a858a8580ULL,0x3f3fad7eecadecd3ULL,0x2121bc42dfbcdffeULL,0x707048e0d848d8a8ULL,0xf1f104f90c040cfdULL,0x6363dfc67adf7a19ULL,0x7777c1ee58c1582fULL,0xafaf75459f759f30ULL,0x42426384a563a5e7ULL,0x2020304050305070ULL,0xe5e51ad12e1a2ecbULL,0xfdfd0ee1120e12efULL,0xbfbf6d65b76db708ULL,0x81814c19d44cd455ULL,0x181814303c143c24ULL,0x2626354c5f355f79ULL,0xc3c32f9d712f71b2ULL,0xbebee16738e13886ULL,0x3535a26afda2fdc8ULL,0x8888cc0b4fcc4fc7ULL,0x2e2e395c4b394b65ULL,0x9393573df957f96aULL,0x5555f2aa0df20d58ULL,0xfcfc82e39d829d61ULL,0x7a7a47f4c947c9b3ULL,0xc8c8ac8befacef27ULL,0xbabae76f32e73288ULL,0x32322b647d2b7d4fULL,0xe6e695d7a495a442ULL,0xc0c0a09bfba0fb3bULL,0x19199832b398b3aaULL,0x9e9ed12768d168f6ULL,0xa3a37f5d817f8122ULL,0x44446688aa66aaeeULL,0x54547ea8827e82d6ULL,0x3b3bab76e6abe6ddULL,0x0b0b83169e839e95ULL,0x8c8cca0345ca45c9ULL,0xc7c729957b297bbcULL,0x6b6bd3d66ed36e05ULL,0x28283c50443c446cULL,0xa7a779558b798b2cULL,0xbcbce2633de23d81ULL,0x16161d2c271d2731ULL,0xadad76419a769a37ULL,0xdbdb3bad4d3b4d96ULL,0x646456c8fa56fa9eULL,0x74744ee8d24ed2a6ULL,0x14141e28221e2236ULL,0x9292db3f76db76e4ULL,0x0c0c0a181e0a1e12ULL,0x48486c90b46cb4fcULL,0xb8b8e46b37e4378fULL,0x9f9f5d25e75de778ULL,0xbdbd6e61b26eb20fULL,0x4343ef862aef2a69ULL,0xc4c4a693f1a6f135ULL,0x3939a872e3a8e3daULL,0x3131a462f7a4f7c6ULL,0xd3d337bd5937598aULL,0xf2f28bff868b8674ULL,0xd5d532b156325683ULL,0x8b8b430dc543c54eULL,0x6e6e59dceb59eb85ULL,0xdadab7afc2b7c218ULL,0x01018c028f8c8f8eULL,0xb1b16479ac64ac
1dULL,0x9c9cd2236dd26df1ULL,0x4949e0923be03b72ULL,0xd8d8b4abc7b4c71fULL,0xacacfa4315fa15b9ULL,0xf3f307fd090709faULL,0xcfcf25856f256fa0ULL,0xcacaaf8feaafea20ULL,0xf4f48ef3898e897dULL,0x4747e98e20e92067ULL,0x1010182028182838ULL,0x6f6fd5de64d5640bULL,0xf0f088fb83888373ULL,0x4a4a6f94b16fb1fbULL,0x5c5c72b8967296caULL,0x383824706c246c54ULL,0x5757f1ae08f1085fULL,0x7373c7e652c75221ULL,0x97975135f351f364ULL,0xcbcb238d652365aeULL,0xa1a17c59847c8425ULL,0xe8e89ccbbf9cbf57ULL,0x3e3e217c6321635dULL,0x9696dd377cdd7ceaULL,0x6161dcc27fdc7f1eULL,0x0d0d861a9186919cULL,0x0f0f851e9485949bULL,0xe0e090dbab90ab4bULL,0x7c7c42f8c642c6baULL,0x7171c4e257c45726ULL,0xccccaa83e5aae529ULL,0x9090d83b73d873e3ULL,0x0606050c0f050f09ULL,0xf7f701f5030103f4ULL,0x1c1c12383612362aULL,0xc2c2a39ffea3fe3cULL,0x6a6a5fd4e15fe18bULL,0xaeaef94710f910beULL,0x6969d0d26bd06b02ULL,0x1717912ea891a8bfULL,0x99995829e858e871ULL,0x3a3a277469276953ULL,0x2727b94ed0b9d0f7ULL,0xd9d938a948384891ULL,0xebeb13cd351335deULL,0x2b2bb356ceb3cee5ULL,0x2222334455335577ULL,0xd2d2bbbfd6bbd604ULL,0xa9a9704990709039ULL,0x0707890e80898087ULL,0x3333a766f2a7f2c1ULL,0x2d2db65ac1b6c1ecULL,0x3c3c22786622665aULL,0x1515922aad92adb8ULL,0xc9c92089602060a9ULL,0x87874915db49db5cULL,0xaaaaff4f1aff1ab0ULL,0x505078a0887888d8ULL,0xa5a57a518e7a8e2bULL,0x03038f068a8f8a89ULL,0x5959f8b213f8134aULL,0x090980129b809b92ULL,0x1a1a173439173923ULL,0x6565daca75da7510ULL,0xd7d731b553315384ULL,0x8484c61351c651d5ULL,0xd0d0b8bbd3b8d303ULL,0x8282c31f5ec35edcULL,0x2929b052cbb0cbe2ULL,0x5a5a77b4997799c3ULL,0x1e1e113c3311332dULL,0x7b7bcbf646cb463dULL,0xa8a8fc4b1ffc1fb7ULL,0x6d6dd6da61d6610cULL,0x2c2c3a584e3a4e62ULL}; + +#endif /* __tables_h */ diff --git a/algo/lyra2/zoin.c b/algo/lyra2/zoin.c index f728417..8b08d13 100644 --- a/algo/lyra2/zoin.c +++ b/algo/lyra2/zoin.c @@ -84,7 +84,7 @@ bool zoin_thread_init() return true; } -bool register_zoin_algo( algo_gate_t* gate ) +bool register_lyra2z330_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT 
| AVX2_OPT; gate->miner_thread_init = (void*)&zoin_thread_init; diff --git a/algo/nist5.c b/algo/nist5.c index 4045a68..75769de 100644 --- a/algo/nist5.c +++ b/algo/nist5.c @@ -17,7 +17,10 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI +#ifdef NO_AES_NI + #include "algo/groestl/sse2/grso.h" + #include "algo/groestl/sse2/grso-macro.c" +#else #include "algo/groestl/aes_ni/hash-groestl.h" #endif @@ -43,7 +46,7 @@ void init_nist5_ctx() void nist5hash(void *output, const void *input) { size_t hashptr; - unsigned char hashbuf[128]; + unsigned char hashbuf[128] __attribute__ ((aligned (32))); sph_u64 hashctA; sph_u64 hashctB; unsigned char hash[128]; @@ -59,8 +62,12 @@ void nist5hash(void *output, const void *input) BLK_C; #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); + grsoState sts_grs; + GRS_I; + GRS_U; + GRS_C; +// sph_groestl512 (&ctx.groestl, hash, 64); +// sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash,512); final_groestl( &ctx.groestl, (char*)hash); diff --git a/algo/quark/quark.c b/algo/quark/quark.c index ae71d06..8428505 100644 --- a/algo/quark/quark.c +++ b/algo/quark/quark.c @@ -19,7 +19,10 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI +#ifdef NO_AES_NI + #include "algo/groestl/sse2/grso.h" + #include "algo/groestl/sse2/grso-macro.c" +#else #include "algo/groestl/aes_ni/hash-groestl.h" #endif @@ -55,7 +58,7 @@ inline static void quarkhash(void *state, const void *input) sph_u64 hashctA; sph_u64 hashctB; int i; - unsigned char hash[128]; + unsigned char hash[128] __attribute__ ((aligned (32))); #ifdef NO_AES_NI sph_groestl512_context ctx; #else @@ -113,9 +116,13 @@ inline static void quarkhash(void *state, const void *input) { #ifdef NO_AES_NI - sph_groestl512_init( &ctx ); - sph_groestl512 ( &ctx, hash, 64 ); - sph_groestl512_close( &ctx, hash ); + grsoState 
sts_grs; + GRS_I; + GRS_U; + GRS_C; +// sph_groestl512_init( &ctx ); +// sph_groestl512 ( &ctx, hash, 64 ); +// sph_groestl512_close( &ctx, hash ); #else reinit_groestl( &ctx ); update_groestl( &ctx, (char*)hash, 512 ); diff --git a/algo/sha2/sha256t.c b/algo/sha2/sha256t.c new file mode 100644 index 0000000..6a8818f --- /dev/null +++ b/algo/sha2/sha256t.c @@ -0,0 +1,143 @@ +#include "miner.h" +#include "algo-gate-api.h" + +#include +#include +#include +#include + +#include "sph-sha2.h" + +//#define DEBUG_ALGO + + +static sph_sha256_context sha256t_ctx __attribute__ ((aligned (64))); +static __thread sph_sha256_context sha256t_mid __attribute__ ((aligned (64))); + +void sha256t_midstate( const void* input ) +{ + memcpy( &sha256t_mid, &sha256t_ctx, sizeof sha256t_mid ); + sph_sha256( &sha256t_mid, input, 64 ); +} + +void sha256t_hash(void* output, const void* input, uint32_t len) +{ + sph_sha256_context ctx_sha256 __attribute__ ((aligned (64))); + uint32_t _ALIGN(64) hashA[16]; + + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 + + memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid ); + sph_sha256( &ctx_sha256, input + midlen, tail ); + +// sph_sha256_init(&ctx_sha256); +// sph_sha256 (&ctx_sha256, input, 80); + sph_sha256_close( &ctx_sha256, hashA ); + +// sph_sha256_init(&ctx_sha256); + memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx ); + sph_sha256( &ctx_sha256, hashA, 32 ); + sph_sha256_close( &ctx_sha256, hashA ); + +// sph_sha256_init(&ctx_sha256); + memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx ); + sph_sha256( &ctx_sha256, hashA, 32 ); + sph_sha256_close( &ctx_sha256, hashA ); + + memcpy( output, hashA, 32 ); +} + +int scanhash_sha256t(int thr_id, struct work *work, + uint32_t max_nonce, uint64_t *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t len = 80; + + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; +#ifdef 
_MSC_VER + uint32_t __declspec(align(32)) hash64[8]; +#else + uint32_t hash64[8] __attribute__((aligned(32))); +#endif + uint32_t endiandata[32]; + + uint64_t htmax[] = { + 0, + 0xF, + 0xFF, + 0xFFF, + 0xFFFF, + 0x10000000 + }; + uint32_t masks[] = { + 0xFFFFFFFF, + 0xFFFFFFF0, + 0xFFFFFF00, + 0xFFFFF000, + 0xFFFF0000, + 0 + }; + + // we need bigendian data... + for (int k = 0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + sha256t_midstate( endiandata ); + +#ifdef DEBUG_ALGO + if (Htarg != 0) + printf("[%d] Htarg=%X\n", thr_id, Htarg); +#endif + for (int m=0; m < 6; m++) { + if (Htarg <= htmax[m]) { + uint32_t mask = masks[m]; + do { + pdata[19] = ++n; + be32enc(&endiandata[19], n); + sha256t_hash(hash64, endiandata, len); +#ifndef DEBUG_ALGO + if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { + *hashes_done = n - first_nonce + 1; + return true; + } +#else + if (!(n % 0x1000) && !thr_id) printf("."); + if (!(hash64[7] & mask)) { + printf("[%d]",thr_id); + if (fulltest(hash64, ptarget)) { + *hashes_done = n - first_nonce + 1; + return true; + } + } +#endif + } while (n < max_nonce && !work_restart[thr_id].restart); + // see blake.c if else to understand the loop on htmax => mask + break; + } + } + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +void sha256t_set_target( struct work* work, double job_diff ) +{ + work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); +} + + +bool register_sha256t_algo( algo_gate_t* gate ) +{ + sph_sha256_init( &sha256t_ctx ); + gate->scanhash = (void*)&scanhash_sha256t; + gate->hash = (void*)&sha256t_hash; + gate->hash_alt = (void*)&sha256t_hash; + gate->set_target = (void*)&sha256t_set_target; + gate->get_max64 = (void*)&get_max64_0x3ffff; + return true; +} diff --git a/algo/x11/x11.c b/algo/x11/x11.c index a806c3c..b32935f 100644 --- a/algo/x11/x11.c +++ b/algo/x11/x11.c @@ -17,7 +17,10 @@ #include "algo/simd/sph_simd.h" #include "algo/echo/sph_echo.h" -#ifndef NO_AES_NI +#ifdef 
NO_AES_NI + #include "algo/groestl/sse2/grso.h" + #include "algo/groestl/sse2/grso-macro.c" +#else #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif @@ -64,7 +67,7 @@ void init_x11_ctx() static void x11_hash( void *state, const void *input ) { - unsigned char hash[128] __attribute__ ((aligned (16))); + unsigned char hash[128] __attribute__ ((aligned (32))); unsigned char hashbuf[128] __attribute__ ((aligned (16))); sph_u64 hashctA; sph_u64 hashctB; @@ -89,8 +92,12 @@ static void x11_hash( void *state, const void *input ) #undef dH #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); + grsoState sts_grs; + GRS_I; + GRS_U; + GRS_C; +// sph_groestl512 (&ctx.groestl, hash, 64); +// sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash, 512 ); final_groestl( &ctx.groestl, (char*)hash ); diff --git a/algo/x11/x11evo.c b/algo/x11/x11evo.c index 3418412..ff3a1c0 100644 --- a/algo/x11/x11evo.c +++ b/algo/x11/x11evo.c @@ -160,7 +160,7 @@ static void evo_twisted_code(uint32_t ntime, char *permstr) static inline void x11evo_hash( void *state, const void *input ) { - uint32_t hash[16]; + uint32_t hash[16] __attribute__ ((aligned (32))); x11evo_ctx_holder ctx; memcpy( &ctx, &x11evo_ctx, sizeof(x11evo_ctx) ); diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c index 7a9aad7..a6dee96 100644 --- a/algo/x11/x11gost.c +++ b/algo/x11/x11gost.c @@ -20,7 +20,10 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI +#ifdef NO_AES_NI + #include "algo/groestl/sse2/grso.h" + #include "algo/groestl/sse2/grso-macro.c" +#else #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif @@ -61,7 +64,7 @@ void init_sib_ctx() void sibhash(void *output, const void *input) { - unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + unsigned char hash[128] __attribute__ ((aligned (32))); 
#define hashA hash #define hashB hash+64 @@ -90,8 +93,12 @@ void sibhash(void *output, const void *input) #undef dH #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); + grsoState sts_grs; + GRS_I; + GRS_U; + GRS_C; +// sph_groestl512 (&ctx.groestl, hash, 64); +// sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash,512); final_groestl( &ctx.groestl, (char*)hash); diff --git a/algo/x13/x13.c b/algo/x13/x13.c index 65d0cda..ca0fae2 100644 --- a/algo/x13/x13.c +++ b/algo/x13/x13.c @@ -29,7 +29,10 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI +#ifdef NO_AES_NI + #include "algo/groestl/sse2/grso.h" + #include "algo/groestl/sse2/grso-macro.c" +#else #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif @@ -71,7 +74,7 @@ void init_x13_ctx() static void x13hash(void *output, const void *input) { - unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + unsigned char hash[128] __attribute__ ((aligned (32))); #define hashB hash+64 x13_ctx_holder ctx; @@ -110,8 +113,12 @@ static void x13hash(void *output, const void *input) //---groetl---- #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); + grsoState sts_grs; + GRS_I; + GRS_U; + GRS_C; +// sph_groestl512 (&ctx.groestl, hash, 64); +// sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash,512); final_groestl( &ctx.groestl, (char*)hash); diff --git a/algo/x14/x14.c b/algo/x14/x14.c index ff55d3c..123f2fd 100644 --- a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -31,7 +31,10 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI +#ifdef NO_AES_NI + #include "algo/groestl/sse2/grso.h" + #include "algo/groestl/sse2/grso-macro.c" +#else #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif 
@@ -75,7 +78,7 @@ void init_x14_ctx() static void x14hash(void *output, const void *input) { - unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + unsigned char hash[128] __attribute__ ((aligned (32))); #define hashB hash+64 x14_ctx_holder ctx; @@ -112,8 +115,12 @@ static void x14hash(void *output, const void *input) //---groestl---- #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); + grsoState sts_grs; + GRS_I; + GRS_U; + GRS_C; +// sph_groestl512 (&ctx.groestl, hash, 64); +// sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash,512); final_groestl( &ctx.groestl, (char*)hash); diff --git a/algo/x15/x15.c b/algo/x15/x15.c index 4f57a10..0c4dc92 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -31,9 +31,12 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI - #include "algo/echo/aes_ni/hash_api.h" +#ifdef NO_AES_NI + #include "algo/groestl/sse2/grso.h" + #include "algo/groestl/sse2/grso-macro.c" +#else #include "algo/groestl/aes_ni/hash-groestl.h" + #include "algo/echo/aes_ni/hash_api.h" #endif typedef struct { @@ -77,7 +80,7 @@ void init_x15_ctx() static void x15hash(void *output, const void *input) { - unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + unsigned char hash[128] __attribute__ ((aligned (32))); #define hashB hash+64 x15_ctx_holder ctx; @@ -113,8 +116,12 @@ static void x15hash(void *output, const void *input) //---groestl---- #ifdef NO_AES_NI - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); + grsoState sts_grs; + GRS_I; + GRS_U; + GRS_C; +// sph_groestl512(&ctx.groestl, hash, 64); +// sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash,512); final_groestl( &ctx.groestl, (char*)hash); diff --git a/algo/x17/x17.c b/algo/x17/x17.c index 8ddecec..e1102c8 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -33,9 +33,12 @@ 
#include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI - #include "algo/echo/aes_ni/hash_api.h" +#ifdef NO_AES_NI + #include "algo/groestl/sse2/grso.h" + #include "algo/groestl/sse2/grso-macro.c" +#else #include "algo/groestl/aes_ni/hash-groestl.h" + #include "algo/echo/aes_ni/hash_api.h" #endif typedef struct { @@ -83,7 +86,7 @@ void init_x17_ctx() static void x17hash(void *output, const void *input) { - unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + unsigned char hash[128] __attribute__ ((aligned (32))); #define hashB hash+64 x17_ctx_holder ctx; @@ -119,8 +122,12 @@ static void x17hash(void *output, const void *input) //---groestl---- #ifdef NO_AES_NI - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); + grsoState sts_grs; + GRS_I; + GRS_U; + GRS_C; +// sph_groestl512(&ctx.groestl, hash, 64); +// sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash,512); final_groestl( &ctx.groestl, (char*)hash); diff --git a/algo/zr5.c b/algo/zr5.c index 85aea47..732a5b1 100644 --- a/algo/zr5.c +++ b/algo/zr5.c @@ -35,7 +35,10 @@ #include "algo/groestl/sph_groestl.h" #include "algo/keccak/sph_keccak.h" -#ifndef NO_AES_NI +#ifdef NO_AES_NI + #include "algo/groestl/sse2/grso.h" + #include "algo/groestl/sse2/grso-macro.c" +#else #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif @@ -83,7 +86,7 @@ static void zr5hash(void *state, const void *input) { DATA_ALIGN16(unsigned char hashbuf[128]); -DATA_ALIGN16(unsigned char hash[128]); +unsigned char hash[128] __attribute__ ((aligned (32))); DATA_ALIGN16(size_t hashptr); DATA_ALIGN16(sph_u64 hashctA); DATA_ALIGN16(sph_u64 hashctB); @@ -121,8 +124,14 @@ static const int arrOrder[][4] = break; case 1: #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); + { + grsoState sts_grs; + GRS_I; + GRS_U; + GRS_C; + } +// sph_groestl512 
(&ctx.groestl, hash, 64); +// sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash,512); final_groestl( &ctx.groestl, (char*)hash); diff --git a/build.sh b/build.sh index 219edb9..23b3600 100755 --- a/build.sh +++ b/build.sh @@ -20,6 +20,6 @@ rm -f config.status CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl -make -j 4 +make strip -s cpuminer diff --git a/configure.ac b/configure.ac index 02b0004..4944c16 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.5.9]) +AC_INIT([cpuminer-opt], [3.5.9.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/miner.h b/miner.h index 7d559e6..9b75cbf 100644 --- a/miner.h +++ b/miner.h @@ -47,14 +47,14 @@ # endif #endif - +/* #ifndef min #define min(a,b) (a>b ? (b) :(a)) #endif #ifndef max #define max(a,b) (a @@ -499,7 +499,7 @@ enum algos { ALGO_LYRA2RE, ALGO_LYRA2REV2, ALGO_LYRA2Z, - ALGO_LYRA2ZOIN, + ALGO_LYRA2Z330, ALGO_M7M, ALGO_MYR_GR, ALGO_NEOSCRYPT, @@ -511,6 +511,7 @@ enum algos { ALGO_SCRYPT, ALGO_SCRYPTJANE, ALGO_SHA256D, + ALGO_SHA256T, ALGO_SHAVITE3, ALGO_SKEIN, ALGO_SKEIN2, @@ -559,7 +560,7 @@ static const char* const algo_names[] = { "lyra2re", "lyra2rev2", "lyra2z", - "lyra2zoin", + "lyra2z330", "m7m", "myr-gr", "neoscrypt", @@ -571,6 +572,7 @@ static const char* const algo_names[] = { "scrypt", "scryptjane", "sha256d", + "sha256t", "shavite3", "skein", "skein2", @@ -657,14 +659,14 @@ Options:\n\ "/* blake2b Sia\n*/"\ blake2s Blake-2 S\n\ bmw BMW 256\n\ - c11 Flax\n\ + c11 Chaincoin\n\ cryptolight Cryptonight-light\n\ cryptonight cryptonote, Monero (XMR)\n\ decred\n\ deep Deepcoin (DCN)\n\ drop Dropcoin\n\ fresh Fresh\n\ - groestl groestl\n\ + groestl dmd-gr, Groestl coin\n\ heavy Heavy\n\ hmq1725 Espers\n\ hodl Hodlcoin\n\ @@ -672,9 +674,9 @@ Options:\n\ lbry LBC, LBRY Credits\n\ luffa Luffa\n\ lyra2re lyra2\n\ - lyra2rev2 lyrav2\n\ + lyra2rev2 lyrav2, Vertcoin\n\ lyra2z Zcoin (XZC)\n\ - lyra2zoin 
Zoin (ZOI)\n\ + lyra2z330 Zoin (ZOI)\n\ m7m Magi (XMG)\n\ myr-gr Myriad-Groestl\n\ neoscrypt NeoScrypt(128, 2, 1)\n\ @@ -686,7 +688,8 @@ Options:\n\ scrypt scrypt(1024, 1, 1) (default)\n\ scrypt:N scrypt(N, 1, 1)\n\ scryptjane:nf\n\ - sha256d SHA-256d\n\ + sha256d Double SHA-256\n\ + sha256t Triple SHA-256, Onecoin (OC)\n\ shavite3 Shavite3\n\ skein Skein+Sha (Skeincoin)\n\ skein2 Double Skein (Woodcoin)\n\ @@ -695,7 +698,7 @@ Options:\n\ veltor\n\ whirlpool\n\ whirlpoolx\n\ - x11 X11\n\ + x11 Dash\n\ x11evo Revolvercoin\n\ x11gost sib (SibCoin)\n\ x13 X13\n\ diff --git a/winbuild-allarch.sh b/winbuild-allarch.sh new file mode 100755 index 0000000..bbb3d81 --- /dev/null +++ b/winbuild-allarch.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +make distclean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="-O3 -march=core-avx2 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl +make -j 4 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-aes-avx2.exe + +make clean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="-O3 -march=corei7-avx -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl +make -j 4 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-aes-avx.exe + +make clean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="-O3 -maes -msse4.2 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl +make -j 4 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-aes-sse42.exe + +make clean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="-O3 -march=corei7 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl +make -j 4 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-sse42.exe + +make clean || echo clean +rm -f config.status +./autogen.sh || echo done +CFLAGS="-O3 -march=core2 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl +make -j 4 +strip -s cpuminer.exe +mv cpuminer.exe cpuminer-sse2.exe 
+ +make clean || echo done + + diff --git a/winbuild.sh b/winbuild.sh index 0107da3..4378c1c 100755 --- a/winbuild.sh +++ b/winbuild.sh @@ -20,6 +20,6 @@ rm -f config.status CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl -make -j 4 +make strip -s cpuminer