From 0681ca996d8d864d65ea03ec77b9184bf4f820ed Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Thu, 30 Jan 2020 03:47:11 -0500
Subject: [PATCH] v3.11.8

---
 RELEASE_NOTES                               |   14 +-
 algo/cubehash/cubehash_sse2.c               |   89 +-
 algo/cubehash/cubehash_sse2.h               |    5 +-
 algo/groestl/aes_ni/groestl-asm-aes.h       | 1043 --------------
 algo/groestl/aes_ni/groestl-asm-avx.h       | 1105 ---------------
 algo/groestl/aes_ni/groestl-asm-vperm.h     | 1397 -------------------
 algo/groestl/aes_ni/groestl-intr-aes.h      |  222 +--
 algo/groestl/aes_ni/groestl-intr-avx.h      | 1072 --------------
 algo/groestl/aes_ni/groestl-intr-vperm.h    | 1294 -----------------
 algo/groestl/aes_ni/groestl-version.h       |   10 -
 algo/groestl/aes_ni/groestl256-asm-aes.h    |  529 -------
 algo/groestl/aes_ni/groestl256-asm-avx.h    |  519 -------
 algo/groestl/aes_ni/groestl256-asm-vperm.h  |  856 ------------
 algo/groestl/aes_ni/groestl256-intr-aes.h   |  121 +-
 algo/groestl/aes_ni/groestl256-intr-avx.h   |  482 -------
 algo/groestl/aes_ni/groestl256-intr-vperm.h |  793 -----------
 algo/groestl/aes_ni/hash-groestl.c          |  112 +-
 algo/groestl/aes_ni/hash-groestl256.c       |   37 +-
 algo/keccak/keccak-4way.c                   |    4 +-
 algo/keccak/keccak-gate.c                   |    2 +-
 algo/keccak/sha3d-4way.c                    |   12 +-
 algo/luffa/luffa_for_sse2.c                 |   63 +-
 algo/luffa/luffa_for_sse2.h                 |    3 +-
 algo/lyra2/allium-4way.c                    |   10 +-
 algo/lyra2/lyra2-gate.c                     |    2 +-
 algo/lyra2/lyra2-gate.h                     |    2 +-
 algo/simd/nist.c                            |  124 +-
 algo/simd/nist.h                            |    4 +-
 algo/skein/skein-hash-4way.c                |   13 +-
 algo/x11/tribus-4way.c                      |    4 +-
 algo/x16/hex.c                              |  165 ++-
 algo/x16/x16r-4way.c                        |  608 +++++---
 algo/x16/x16r-gate.c                        |    2 +
 algo/x16/x16r-gate.h                        |    2 +-
 algo/x16/x16rt-4way.c                       |  786 +++++++----
 algo/x16/x16rv2-4way.c                      | 1009 +++++++++-----
 algo/x16/x21s-4way.c                        |  786 +++++++----
 algo/x17/x17-4way.c                         |    6 +-
 algo/x17/x17.c                              |   64 +-
 algo/yespower/yescrypt-r8g.c                |    3 +-
 configure                                   |   20 +-
 configure.ac                                |    2 +-
 cpu-miner.c                                 |  102 +-
 miner.h                                     |    4 +-
 simd-utils/intrlv.h                         |   34 +-
 util.c                                      |   21 +-
 46 files changed, 2882 insertions(+), 10675 deletions(-)
 delete mode 100644 algo/groestl/aes_ni/groestl-asm-aes.h
 delete mode 100644 algo/groestl/aes_ni/groestl-asm-avx.h
 delete mode 100644 algo/groestl/aes_ni/groestl-asm-vperm.h
 delete mode 100644 algo/groestl/aes_ni/groestl-intr-avx.h
 delete mode 100644 algo/groestl/aes_ni/groestl-intr-vperm.h
 delete mode 100644 algo/groestl/aes_ni/groestl-version.h
 delete mode 100644 algo/groestl/aes_ni/groestl256-asm-aes.h
 delete mode 100644 algo/groestl/aes_ni/groestl256-asm-avx.h
 delete mode 100644 algo/groestl/aes_ni/groestl256-asm-vperm.h
 delete mode 100644 algo/groestl/aes_ni/groestl256-intr-avx.h
 delete mode 100644 algo/groestl/aes_ni/groestl256-intr-vperm.h

diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index aa6938b..1a90761 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,9 +65,21 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.11.8
+
+Fixed network hashrate showing incorrect data; it should be close now.
+
+Fixed compile errors when using GCC 10, which enables -fno-common by default.
+
+Faster x16r, x16rv2, x16rt, x16s, x21s, veil and hex with midstate prehash.
+
+Decoupled sapling usage from block version 5 in yescryptr8g.
+
+More detailed data reporting for low-difficulty rejected shares.
+
 v3.11.7
 
-Added yescryptr8g algo fotr KOTO, including support for block version 5.
+Added yescryptr8g algo for KOTO, including support for block version 5.
 
 Added sha3d algo for BSHA3.
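For context on the midstate prehash speedup listed above: an 80-byte block header spans one full 64-byte input block plus a 16-byte tail, and only the tail changes while scanning nonces. Hashing the constant first block once per work unit and restarting each nonce from that saved midstate removes most of the first-block work from the scan loop. The C sketch below is illustrative only; the toy_* names and mixing function are hypothetical stand-ins, not code from this patch.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Toy stand-in for a real 64-byte-block hash context; the real miner
   carries the state of the first hash in the x16r-style chain. */
typedef struct { uint64_t s; } toy_ctx;

static void toy_absorb( toy_ctx *c, const uint8_t *p, size_t n )
{
   for ( size_t i = 0; i < n; i++ )
      c->s = ( c->s ^ p[i] ) * 0x100000001b3ULL;  /* FNV-1a style mixing */
}

int main(void)
{
   uint8_t header[80] = {0};                /* bytes 76..79 hold the nonce */
   toy_ctx midstate = { 0xcbf29ce484222325ULL };

   toy_absorb( &midstate, header, 64 );     /* prehash: once per work unit */

   for ( uint32_t nonce = 0; nonce < 4; nonce++ )
   {
      memcpy( header + 76, &nonce, sizeof nonce );
      toy_ctx c = midstate;                 /* resume from saved midstate  */
      toy_absorb( &c, header + 64, 16 );    /* per nonce: only the tail    */
      printf( "nonce %u -> %016llx\n", (unsigned)nonce,
              (unsigned long long)c.s );
   }
   return 0;
}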
diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c
index c508248..c87829d 100644
--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -230,11 +230,10 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
 
    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
-                      _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                    0,0,0,0, 0,0,0,0x80 ) );
+                      m128_const_64( 0, 0x80 ) );
 
    transform( sp );
-   sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
+   sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
    transform( sp );
    transform( sp );
    transform( sp );
@@ -276,11 +275,89 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
 
    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
-                      _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                    0,0,0,0, 0,0,0,0x80 ) );
+                      m128_const_64( 0, 0x80 ) );
 
    transform( sp );
-   sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
+   sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
+
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+   transform( sp );
+
+   for ( i = 0; i < sp->hashlen; i++ )
+      hash[i] = sp->x[i];
+
+   return SUCCESS;
+}
+
+int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
+                   const byte *data, size_t size )
+{
+   __m128i *x = (__m128i*)sp->x;
+   sp->hashlen = hashbitlen/128;
+   sp->blocksize = 32/16;
+   sp->rounds = 16;
+   sp->pos = 0;
+
+   if ( hashbitlen == 512 )
+   {
+
+      x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
+      x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
+      x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
+      x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
+      x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
+      x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
+      x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
+      x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
+   }
+   else
+   {
+      x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
+      x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
+      x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
+      x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
+      x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
+      x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
+      x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
+      x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
+   }
+
+
+
+
+   const int len = size / 16;
+   const __m128i* in = (__m128i*)data;
+   __m128i* hash = (__m128i*)digest;
+   int i;
+
+   // It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
+   // In current usage, data is either 64 or 80 bytes.
+
+   for ( i = 0; i < len; i++ )
+   {
+      sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
+      sp->pos++;
+      if ( sp->pos == sp->blocksize )
+      {
+         transform( sp );
+         sp->pos = 0;
+      }
+   }
+
+   // pos is zero for 64 byte data, 1 for 80 byte data.
+ sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], + m128_const_64( 0, 0x80 ) ); + transform( sp ); + + sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) ); transform( sp ); transform( sp ); diff --git a/algo/cubehash/cubehash_sse2.h b/algo/cubehash/cubehash_sse2.h index 4e1eaa3..69da618 100644 --- a/algo/cubehash/cubehash_sse2.h +++ b/algo/cubehash/cubehash_sse2.h @@ -19,7 +19,7 @@ struct _cubehashParam int rounds; int blocksize; // __m128i int pos; // number of __m128i read into x from current block - __m128i _ALIGN(256) x[8]; // aligned for __m256i + __m128i _ALIGN(64) x[8]; // aligned for __m256i }; typedef struct _cubehashParam cubehashParam; @@ -39,6 +39,9 @@ int cubehashDigest(cubehashParam* sp, byte *digest); int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data, size_t size ); +int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen, + const byte *data, size_t size ); + #ifdef __cplusplus } #endif diff --git a/algo/groestl/aes_ni/groestl-asm-aes.h b/algo/groestl/aes_ni/groestl-asm-aes.h deleted file mode 100644 index c4e44a4..0000000 --- a/algo/groestl/aes_ni/groestl-asm-aes.h +++ /dev/null @@ -1,1043 +0,0 @@ -/* groestl-asm-aes.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3, sse4.1, and aes - * instructions. - * Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz - * - * This code is placed in the public domain - */ - -#include "hash-groestl.h" -/* global constants */ -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; -__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; - -/* temporary variables */ -__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP[3*16]; - - -#define tos(a) #a -#define tostr(a) tos(a) - - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b */ -#define MUL2(i, j, k){\ - asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\ - asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\ - asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\ - asm("pand xmm"tostr(j)", xmm"tostr(k)"");\ - asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* Yet another implementation of MixBytes. - This time we use the formulae (3) from the paper "Byte Slicing Groestl". - Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). - but we use the relations: - t_i = a_i + a_{i+3} - x_i = t_i + t_{i+3} - y_i = t_i + t+{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. 
Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* t_i = a_i + a_{i+1} */\ - asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ - asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\ - asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\ - asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\ - asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\ - asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\ - asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\ - asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\ - /* spill values y_4, y_5 to memory */\ - asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\ - asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ - asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ - asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\ - \ - /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - /* compute w_i : add y_{i+4} */\ - asm("movaps xmm"tostr(b1)", [ALL_1B]");\ - MUL2(a0, b0, b1);\ - asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\ - MUL2(a1, b0, b1);\ - asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\ - MUL2(a2, b0, b1);\ - asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\ - MUL2(a3, b0, b1);\ - asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\ - MUL2(a4, b0, b1);\ - asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\ - MUL2(a5, b0, b1);\ - asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\ - MUL2(a6, b0, b1);\ - asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\ - MUL2(a7, b0, b1);\ - asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\ - \ - /* compute v_i : double w_i */\ - /* add to y_4 y_5 .. v3, v4, ... 
*/\ - MUL2(a0, b0, b1);\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ - MUL2(a1, b0, b1);\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\ - MUL2(a2, b0, b1);\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ - MUL2(a5, b0, b1);\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\ - MUL2(a6, b0, b1);\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ - MUL2(a7, b0, b1);\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ - MUL2(a3, b0, b1);\ - MUL2(a4, b0, b1);\ - asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\ - asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ -}/*MixBytes*/ - -#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}while(0); - -#define Push_All_Regs() do{\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}while(0); - -#define Pop_All_Regs() do{\ -/* not using any... 
- asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}while(0); - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant */\ - asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - /* ShiftBytes + SubBytes (interleaved) */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ - \ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - \ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - \ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ - \ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd 
xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - \ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ - \ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ - asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i7)", 
xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm12 - xmm15 */ - asm ("movaps xmm12, [rdi+0*16]"); - asm ("movaps xmm13, [rdi+1*16]"); - asm ("movaps xmm14, [rdi+2*16]"); - asm ("movaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm12"); - asm ("movaps [rdi+1*16], xmm2"); - asm ("movaps [rdi+2*16], xmm6"); - asm ("movaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("movaps xmm12, [rsi+0*16]"); - asm ("movaps xmm13, [rsi+1*16]"); - asm ("movaps xmm14, [rsi+2*16]"); - asm ("movaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm0, [rdi+1*16]"); - asm ("movaps xmm4, [rdi+2*16]"); - asm ("movaps xmm5, [rdi+3*16]"); - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("pxor xmm8, xmm12"); - asm ("pxor xmm0, xmm2"); - asm ("pxor xmm4, xmm6"); - asm ("pxor xmm5, xmm7"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("pxor xmm0, xmm8"); - asm ("pxor xmm1, xmm10"); - asm ("pxor xmm2, xmm12"); - asm ("pxor xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("pxor xmm0, [rdi+0*16]"); - asm ("pxor xmm1, [rdi+1*16]"); - asm ("pxor xmm2, [rdi+2*16]"); - asm ("pxor xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm0"); - asm ("movaps [rdi+1*16], xmm1"); - asm ("movaps [rdi+2*16], xmm2"); - asm ("movaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); 
- -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm10, [rdi+1*16]"); - asm ("movaps xmm12, [rdi+2*16]"); - asm ("movaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm10, [rdi+1*16]"); - asm ("pxor xmm12, [rdi+2*16]"); - asm ("pxor xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+2*16], xmm9"); - asm ("movaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - -#if (LENGTH > 256) - -#define SET_CONSTANTS(){\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ - ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0b0e0104070a0d00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x0306090c0f020508ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0c0f0205080b0e01ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070a0d00030609ULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0d000306090c0f02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05080b0e0104070aULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0e0104070a0d0003ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x06090c0f0205080bULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0f0205080b0e0104ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x070a0d000306090cULL;\ - ((u64*)SUBSH_MASK)[10] = 0x000306090c0f0205ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x080b0e0104070a0dULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0104070a0d000306ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x090c0f0205080b0eULL;\ - ((u64*)SUBSH_MASK)[14] = 0x06090c0f0205080bULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0e0104070a0d0003ULL;\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ - ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ - ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - }\ -}while(0); - -#define Push_All_Regs() do{\ - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");\ -}while(0); - -#define Pop_All_Regs() do{\ - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");\ -}while(0); - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* SubBytes */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("aesenclast 
xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -#define ROUNDS_P(){\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("1:");\ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm8, [ROUND_CONST_P+eax*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm9, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm10, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm11, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm12, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm13, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm14, [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm15, [SUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - \ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm0, [ROUND_CONST_P+ebx*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm1, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm2, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm3, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm4, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm5, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm6, [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm7, [SUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 1b");\ -} - -#define ROUNDS_Q(){\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("2:");\ - /* AddRoundConstant Q1024 */\ - asm ("movaps xmm1, [ALL_FF]");\ - asm ("pxor xmm8, xmm1");\ - asm ("pxor xmm9, xmm1");\ - asm ("pxor xmm10, xmm1");\ - asm ("pxor xmm11, xmm1");\ - asm ("pxor xmm12, xmm1");\ - asm ("pxor xmm13, xmm1");\ - asm ("pxor xmm14, xmm1");\ - asm ("pxor xmm15, [ROUND_CONST_Q+eax*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm9, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm10, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm11, [SUBSH_MASK+7*16]");\ - asm ("pshufb xmm12, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm13, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm14, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm15, [SUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - \ - /* AddConstant */\ - asm ("movaps xmm9, [ALL_FF]");\ - asm ("pxor xmm0, xmm9");\ - asm ("pxor xmm1, xmm9");\ - asm ("pxor xmm2, xmm9");\ - asm ("pxor xmm3, xmm9");\ - asm ("pxor xmm4, xmm9");\ - asm ("pxor xmm5, xmm9");\ - asm ("pxor xmm6, xmm9");\ - asm ("pxor xmm7, [ROUND_CONST_Q+ebx*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm1, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm2, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm3, [SUBSH_MASK+7*16]");\ - asm ("pshufb xmm4, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm5, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm6, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm7, [SUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - 
asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 2b");\ -} - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ - \ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i2)"");\ - asm ("pshufb xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("pshufb xmm"tostr(i7)", xmm"tostr(t0)"");\ - \ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i0)"");\ - asm ("punpckhwd xmm"tostr(t2)", xmm"tostr(i5)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i7)"");\ - asm ("punpcklwd xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(t1)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - \ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ - \ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(i0)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(t4)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(t5)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t0)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(t5)", xmm"tostr(t1)"");\ - asm ("movdqa xmm"tostr(t6)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(t7)", xmm"tostr(t2)"");\ - asm ("punpckhdq xmm"tostr(t6)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t2)", xmm"tostr(t3)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(t7)", xmm"tostr(t3)"");\ - \ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i4)", xmm"tostr(t4)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(t4)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i6)", xmm"tostr(t5)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t6)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(t5)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t6)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t7)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t7)"");\ - /* 
transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - /* transpose matrix to get output format */\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(t0)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(t1)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i6)"");\ - asm ("movaps xmm"tostr(o0)", [TRANSP_MASK]");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(t2)", xmm"tostr(i7)"");\ - /* load transpose mask into a register, because it will be used 8 times */\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i4)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(o1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(o0)"");\ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(t1)"");\ - \ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("punpckhwd xmm"tostr(o0)", xmm"tostr(i2)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhwd xmm"tostr(o2)", xmm"tostr(t0)"");\ - asm ("punpcklwd xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhwd xmm"tostr(t4)", xmm"tostr(t2)"");\ - asm ("punpcklwd xmm"tostr(t1)", xmm"tostr(t2)"");\ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(o0)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(o2)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(o0)", xmm"tostr(t3)"");\ - asm ("punpckhdq xmm"tostr(i3)", xmm"tostr(t3)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(i5)", xmm"tostr(t1)"");\ - asm ("punpckldq xmm"tostr(o2)", xmm"tostr(t4)"");\ - asm ("punpckhdq xmm"tostr(i7)", xmm"tostr(t4)"");\ - /* transpose done */\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile 
("emms"); - - /* load IV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* transform chaining value from column ordering into row ordering */ - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF1024(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm8 - xmm15 (Q = message) */ - asm ("movaps xmm8, [rsi+0*16]"); - asm ("movaps xmm9, [rsi+1*16]"); - asm ("movaps xmm10, [rsi+2*16]"); - asm ("movaps xmm11, [rsi+3*16]"); - asm ("movaps xmm12, [rsi+4*16]"); - asm ("movaps xmm13, [rsi+5*16]"); - asm ("movaps xmm14, [rsi+6*16]"); - asm ("movaps xmm15, [rsi+7*16]"); - - /* transform message M from column ordering into row ordering */ - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store message M (Q input) for later */ - asm ("movaps [QTEMP+0*16], xmm8"); - asm ("movaps [QTEMP+1*16], xmm9"); - asm ("movaps [QTEMP+2*16], xmm10"); - asm ("movaps [QTEMP+3*16], xmm11"); - asm ("movaps [QTEMP+4*16], xmm12"); - asm ("movaps [QTEMP+5*16], xmm13"); - asm ("movaps [QTEMP+6*16], xmm14"); - asm ("movaps [QTEMP+7*16], xmm15"); - - /* xor CV to message to get P input */ - /* result: CV+M in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV+M) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV+M)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* store P(CV+M)+CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - /* load message M (Q input) into xmm8-15 */ - asm ("movaps xmm8, [QTEMP+0*16]"); - asm ("movaps xmm9, [QTEMP+1*16]"); - asm ("movaps xmm10, [QTEMP+2*16]"); - asm ("movaps xmm11, [QTEMP+3*16]"); - asm ("movaps xmm12, [QTEMP+4*16]"); - asm ("movaps xmm13, [QTEMP+5*16]"); - asm ("movaps xmm14, [QTEMP+6*16]"); - asm ("movaps xmm15, [QTEMP+7*16]"); - - /* compute permutation Q */ - /* result: Q(M) in xmm8...xmm15 */ - ROUNDS_Q(); - - /* xor Q output */ - /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ - asm 
("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF1024(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+4*16], xmm0"); - asm ("movaps [rdi+5*16], xmm6"); - asm ("movaps [rdi+6*16], xmm13"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - diff --git a/algo/groestl/aes_ni/groestl-asm-avx.h b/algo/groestl/aes_ni/groestl-asm-avx.h deleted file mode 100644 index 6e8be1b..0000000 --- a/algo/groestl/aes_ni/groestl-asm-avx.h +++ /dev/null @@ -1,1105 +0,0 @@ -/* groestl-asm-avx.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx - * instructions. - * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz - * - * This code is placed in the public domain - */ - -#include "hash-groestl.h" - -/* global variables */ -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; -__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (32))) unsigned char ALL_1B[32]; -__attribute__ ((aligned (32))) unsigned char ALL_FF[32]; - -/* temporary variables */ -__attribute__ ((aligned (32))) unsigned char TEMP[6*32]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}while(0); - -#define Push_All_Regs() do{\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}while(0); - -#define Pop_All_Regs() do{\ -/* not using any... - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}while(0); - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2(i, j, k, z){\ - asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2v2(i, j, k, z){\ - asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* Yet another implementation of MixBytes. 
- This time we use the formulae (3) from the paper "Byte Slicing Groestl". - Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). - but we use the relations: - t_i = a_i + a_{i+3} - x_i = t_i + t_{i+3} - y_i = t_i + t+{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ - asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ - asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ - asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ - asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ - asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ - asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ - asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ - asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ - \ - /* t_i = a_i + a_{i+1} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\ - \ - asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\ - \ - /* spill values y_4, y_5 to memory */\ - asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\ - asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\ - asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\ - \ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ - asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(a4)", 
xmm"tostr(a4)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\ - \ - /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\ - VMUL2(a7, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a0, b0, b1, b2);\ - \ - /* compute w_i : add y_{i+4} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\ - asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a7, b0, b1, b2);\ - \ - /* add to y_4 y_5 .. v3, v4, ... */\ - asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\ -}/*MixBytes*/ - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant */\ - asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - /* ShiftBytes + SubBytes (interleaved) */\ - asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", 
xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ -\ - asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ -\ - asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ -\ - asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\ - asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\ - asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\ - 
asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\ - asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm12 - xmm15 */ - asm ("vmovaps xmm12, [rdi+0*16]"); - asm ("vmovaps xmm13, [rdi+1*16]"); - asm ("vmovaps xmm14, [rdi+2*16]"); - asm ("vmovaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("vmovaps [rdi+0*16], xmm12"); - asm ("vmovaps [rdi+1*16], xmm2"); - asm ("vmovaps [rdi+2*16], xmm6"); - asm ("vmovaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm 
(".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("vmovaps xmm12, [rsi+0*16]"); - asm ("vmovaps xmm13, [rsi+1*16]"); - asm ("vmovaps xmm14, [rsi+2*16]"); - asm ("vmovaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("vpxor xmm8, xmm12, [rdi+0*16]"); - asm ("vpxor xmm0, xmm2, [rdi+1*16]"); - asm ("vpxor xmm4, xmm6, [rdi+2*16]"); - asm ("vpxor xmm5, xmm7, [rdi+3*16]"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("vpxor xmm0, xmm0, xmm8"); - asm ("vpxor xmm1, xmm1, xmm10"); - asm ("vpxor xmm2, xmm2, xmm12"); - asm ("vpxor xmm3, xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("vpxor xmm0, xmm0, [rdi+0*16]"); - asm ("vpxor xmm1, xmm1, [rdi+1*16]"); - asm ("vpxor xmm2, xmm2, [rdi+2*16]"); - asm ("vpxor xmm3, xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("vmovaps [rdi+0*16], xmm0"); - asm ("vmovaps [rdi+1*16], xmm1"); - asm ("vmovaps [rdi+2*16], xmm2"); - asm ("vmovaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("vmovaps xmm8, [rdi+0*16]"); - asm ("vmovaps xmm10, [rdi+1*16]"); - asm ("vmovaps xmm12, [rdi+2*16]"); - asm ("vmovaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("vpxor xmm8, xmm8, [rdi+0*16]"); - asm ("vpxor xmm10, xmm10, [rdi+1*16]"); - asm ("vpxor xmm12, xmm12, [rdi+2*16]"); - asm ("vpxor xmm14, xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - - /* we only need to return the truncated half of the state */ - asm ("vmovaps [rdi+2*16], xmm9"); - asm ("vmovaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm 
(".att_syntax noprefix"); - - return; -} - -#endif - -#if (LENGTH > 256) - -#define SET_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ - ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ - ((u64*)ALL_FF)[2] = 0x0000000000000000ULL;\ - ((u64*)ALL_FF)[3] = 0x0000000000000000ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[2] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[3] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0b0e0104070a0d00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x0306090c0f020508ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0c0f0205080b0e01ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070a0d00030609ULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0d000306090c0f02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05080b0e0104070aULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0e0104070a0d0003ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x06090c0f0205080bULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0f0205080b0e0104ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x070a0d000306090cULL;\ - ((u64*)SUBSH_MASK)[10] = 0x000306090c0f0205ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x080b0e0104070a0dULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0104070a0d000306ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x090c0f0205080b0eULL;\ - ((u64*)SUBSH_MASK)[14] = 0x06090c0f0205080bULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0e0104070a0d0003ULL;\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ - ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ - ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - }\ -}while(0); - -#define Push_All_Regs() do{\ - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");\ -}while(0); - -#define Pop_All_Regs() do{\ - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");\ -}while(0); - -/* AVX MUL2 - * ymm[i] will be multiplied by 2 - * ymm[j] will be lost - * ymm[k] has to be all 0x1b - * ymm[z] has to be zero - * clobbers: t2, t3 */ -#define VMUL2(i, j, k, z, ih, jh){\ - asm("vextractf128 xmm"tostr(ih)", ymm"tostr(i)", 1");\ - asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ - asm("vpcmpgtb xmm"tostr(jh)", xmm"tostr(z)", xmm"tostr(ih)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(ih)", xmm"tostr(ih)", xmm"tostr(ih)"");\ - asm("vinsertf128 ymm"tostr(j)", ymm"tostr(j)", xmm"tostr(jh)", 1");\ - asm("vinsertf128 ymm"tostr(i)", ymm"tostr(i)", xmm"tostr(ih)", 1");\ - asm("vandpd ymm"tostr(j)", ymm"tostr(j)", ymm"tostr(k)"");\ - asm("vxorpd ymm"tostr(i)", ymm"tostr(i)", ymm"tostr(j)"");\ -}/**/ - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2v2(i, j, k, z){\ - asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2v3(i, j, k, z){\ - asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* Yet 
another implementation of MixBytes.
- This time we use the formulae (3) from the paper "Byte Slicing Groestl".
- Input: a0, ..., a7
- Output: b0, ..., b7 = MixBytes(a0,...,a7).
- but we use the relations:
- t_i = a_i + a_{i+1}
- x_i = t_i + t_{i+3}
- y_i = t_i + t_{i+2} + a_{i+6}
- z_i = 2*x_i
- w_i = z_i + y_{i+4}
- v_i = 2*w_i
- b_i = v_{i+3} + y_{i+4}
- We keep building b_i in registers ymm8..ymm15 by first building y_{i+4} there
- and then adding v_i computed in the meantime in registers ymm0..ymm7.
- We almost fit into 16 registers, needing only a few spills to memory.
- This implementation costs 7.7 c/b, giving a total speed on SNB of 10.7 c/b.
- K. Matusiewicz, 2011/05/29 */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
- /* ymm8..ymm15 = a2 a3 ... a0 a1 */\
- asm("vmovdqa ymm"tostr(b0)", ymm"tostr(a2)"");\
- asm("vmovdqa ymm"tostr(b1)", ymm"tostr(a3)"");\
- asm("vmovdqa ymm"tostr(b2)", ymm"tostr(a4)"");\
- asm("vmovdqa ymm"tostr(b3)", ymm"tostr(a5)"");\
- asm("vmovdqa ymm"tostr(b4)", ymm"tostr(a6)"");\
- asm("vmovdqa ymm"tostr(b5)", ymm"tostr(a7)"");\
- asm("vmovdqa ymm"tostr(b6)", ymm"tostr(a0)"");\
- asm("vmovdqa ymm"tostr(b7)", ymm"tostr(a1)"");\
- \
- /* t_i = a_i + a_{i+1} */\
- asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", ymm"tostr(a1)"");\
- asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", ymm"tostr(a2)"");\
- asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", ymm"tostr(a3)"");\
- asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", ymm"tostr(a4)"");\
- asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", ymm"tostr(a5)"");\
- asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(a6)"");\
- asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(a7)"");\
- asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", ymm"tostr(b6)"");\
- \
- /* build y4 y5 y6 ... 
in regs ymm8, ymm9, ymm10 by adding t_i*/\ - asm("vxorpd ymm"tostr(b0)", ymm"tostr(b0)", ymm"tostr(a4)"");\ - asm("vxorpd ymm"tostr(b1)", ymm"tostr(b1)", ymm"tostr(a5)"");\ - asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(a6)"");\ - asm("vxorpd ymm"tostr(b3)", ymm"tostr(b3)", ymm"tostr(a7)"");\ - asm("vxorpd ymm"tostr(b4)", ymm"tostr(b4)", ymm"tostr(a0)"");\ - asm("vxorpd ymm"tostr(b5)", ymm"tostr(b5)", ymm"tostr(a1)"");\ - asm("vxorpd ymm"tostr(b6)", ymm"tostr(b6)", ymm"tostr(a2)"");\ - asm("vxorpd ymm"tostr(b7)", ymm"tostr(b7)", ymm"tostr(a3)"");\ - \ - asm("vxorpd ymm"tostr(b0)", ymm"tostr(b0)", ymm"tostr(a6)"");\ - asm("vxorpd ymm"tostr(b1)", ymm"tostr(b1)", ymm"tostr(a7)"");\ - asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(a0)"");\ - asm("vxorpd ymm"tostr(b3)", ymm"tostr(b3)", ymm"tostr(a1)"");\ - asm("vxorpd ymm"tostr(b4)", ymm"tostr(b4)", ymm"tostr(a2)"");\ - asm("vxorpd ymm"tostr(b5)", ymm"tostr(b5)", ymm"tostr(a3)"");\ - asm("vxorpd ymm"tostr(b6)", ymm"tostr(b6)", ymm"tostr(a4)"");\ - asm("vxorpd ymm"tostr(b7)", ymm"tostr(b7)", ymm"tostr(a5)"");\ - \ - /* spill values y_4, y_5 to memory */\ - asm("vmovaps [TEMP+0*32], ymm"tostr(b0)"");\ - asm("vmovaps [TEMP+1*32], ymm"tostr(b1)"");\ - asm("vmovaps [TEMP+2*32], ymm"tostr(b2)"");\ - asm("vmovaps [TEMP+3*32], ymm"tostr(b3)"");\ - asm("vmovaps [TEMP+4*32], ymm"tostr(b4)"");\ - \ - /* save values t0, t1, t2 to ymm8, ymm9 and memory */\ - asm("vmovdqa ymm"tostr(b0)", ymm"tostr(a0)"");\ - asm("vmovdqa ymm"tostr(b1)", ymm"tostr(a1)"");\ - asm("vmovaps [TEMP+5*32], ymm"tostr(a2)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", ymm"tostr(a3)"");\ - asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", ymm"tostr(a4)"");\ - asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", ymm"tostr(a5)"");\ - asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", ymm"tostr(a6)"");\ - asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", ymm"tostr(a7)"");\ - asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(b0)"");\ - asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(b1)"");\ - asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", [TEMP+5*32]");\ - \ - /*compute z_i : double x_i using temp ymm8 and 1B ymm9 */\ - asm("vmovaps ymm"tostr(b1)", [ALL_1B]");\ - asm("vxorpd ymm"tostr(b2)", ymm"tostr(b2)", ymm"tostr(b2)"");\ - VMUL2(a7, b0, b1, b2, b3, b4);\ - VMUL2(a6, b0, b1, b2, b3, b4);\ - VMUL2(a5, b0, b1, b2, b3, b4);\ - VMUL2(a4, b0, b1, b2, b3, b4);\ - VMUL2(a3, b0, b1, b2, b3, b4);\ - VMUL2(a2, b0, b1, b2, b3, b4);\ - VMUL2(a1, b0, b1, b2, b3, b4);\ - VMUL2(a0, b0, b1, b2, b3, b4);\ - \ - /* compute w_i : add y_{i+4} */\ - asm("vxorpd ymm"tostr(a0)", ymm"tostr(a0)", [TEMP+0*32]");\ - asm("vxorpd ymm"tostr(a1)", ymm"tostr(a1)", [TEMP+1*32]");\ - asm("vxorpd ymm"tostr(a2)", ymm"tostr(a2)", [TEMP+2*32]");\ - asm("vxorpd ymm"tostr(a3)", ymm"tostr(a3)", [TEMP+3*32]");\ - asm("vxorpd ymm"tostr(a4)", ymm"tostr(a4)", [TEMP+4*32]");\ - asm("vxorpd ymm"tostr(a5)", ymm"tostr(a5)", ymm"tostr(b5)"");\ - asm("vxorpd ymm"tostr(a6)", ymm"tostr(a6)", ymm"tostr(b6)"");\ - asm("vxorpd ymm"tostr(a7)", ymm"tostr(a7)", ymm"tostr(b7)"");\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0, b0, b1, b2, b3, b4);\ - VMUL2(a1, b0, b1, b2, b3, b4);\ - VMUL2(a2, b0, b1, b2, b3, b4);\ - VMUL2(a3, b0, b1, b2, b3, b4);\ - VMUL2(a4, b0, b1, b2, b3, b4);\ - VMUL2(a5, b0, b1, b2, b3, b4);\ - VMUL2(a6, b0, b1, b2, b3, b4);\ - VMUL2(a7, b0, b1, b2, b3, b4);\ - \ - /* add to y_4 y_5 .. v3, v4, ... 
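
Each VMUL2 above doubles 32 GF(2^8) elements; this AVX1 version has to split the ymm into xmm halves because 256-bit integer operations only arrived with AVX2. The underlying trick is the same as in VMUL2v2: pcmpgtb against zero spreads each byte's top bit into a full 0x00/0xFF mask, paddb shifts each byte left without cross-byte carries, and the mask gates the 0x1b reduction. The same idea in plain SSE2 intrinsics (gf2_mul2_x16 is an illustrative name):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>   /* SSE2 */

static __m128i gf2_mul2_x16(__m128i x)
{
    __m128i msb  = _mm_cmpgt_epi8(_mm_setzero_si128(), x); /* 0xFF where top bit set */
    __m128i dbl  = _mm_add_epi8(x, x);                     /* byte-wise x << 1 */
    __m128i poly = _mm_and_si128(msb, _mm_set1_epi8(0x1b));
    return _mm_xor_si128(dbl, poly);                       /* conditional reduce */
}

int main(void)
{
    uint8_t v[16], o[16];
    for (int i = 0; i < 16; i++) v[i] = (uint8_t)(i * 17);
    __m128i r = gf2_mul2_x16(_mm_loadu_si128((const __m128i *)v));
    _mm_storeu_si128((__m128i *)o, r);
    for (int i = 0; i < 16; i++)
        printf("%02x -> %02x\n", v[i], o[i]);
    return 0;
}
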
*/\ - asm("vxorpd ymm"tostr(b0)", ymm"tostr(a3)", [TEMP+0*32]");\ - asm("vxorpd ymm"tostr(b1)", ymm"tostr(a4)", [TEMP+1*32]");\ - asm("vxorpd ymm"tostr(b2)", ymm"tostr(a5)", [TEMP+2*32]");\ - asm("vxorpd ymm"tostr(b3)", ymm"tostr(a6)", [TEMP+3*32]");\ - asm("vxorpd ymm"tostr(b4)", ymm"tostr(a7)", [TEMP+4*32]");\ - asm("vxorpd ymm"tostr(b5)", ymm"tostr(a0)", ymm"tostr(b5)"");\ - asm("vxorpd ymm"tostr(b6)", ymm"tostr(a1)", ymm"tostr(b6)"");\ - asm("vxorpd ymm"tostr(b7)", ymm"tostr(a2)", ymm"tostr(b7)"");\ -}/*MixBytes*/ - -/* AVX SubShift - * inputs: - * * i - * * c0 (must be 0) - * * ShiftP - * * ShiftQ - * output i = S[Shift(i_1, ShiftQ)|Shift(i_0, ShiftP)] - * clobbers: t0 - * */ -#define SubShift(i, t0, c0, ShiftP, ShiftQ){\ - asm("vextractf128 xmm"tostr(t0)", ymm"tostr(i)", 1");\ - asm("vpshufb xmm"tostr(i)", xmm"tostr(i)", [SUBSH_MASK+"tostr(ShiftP)"*16]");\ - asm("vpshufb xmm"tostr(t0)", xmm"tostr(t0)", [SUBSH_MASK+"tostr(ShiftQ)"*16]");\ - asm("vaesenclast xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(c0)"");\ - asm("vaesenclast xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm("vinsertf128 ymm"tostr(i)", ymm"tostr(i)", xmm"tostr(t0)", 1");\ -}/**/ - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBSHIFTMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* ShiftBytes + SubBytes */\ - asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ - SubShift(a0, b1, b0, 0, 1);\ - SubShift(a1, b1, b0, 1, 3);\ - SubShift(a2, b1, b0, 2, 5);\ - SubShift(a3, b1, b0, 3, 7);\ - SubShift(a4, b1, b0, 4, 0);\ - SubShift(a5, b1, b0, 5, 2);\ - SubShift(a6, b1, b0, 6, 4);\ - SubShift(a7, b1, b0, 7, 6);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -#define ROUNDS_P_Q(){\ - asm ("xor rax, rax");\ - asm ("1:");\ - /* AddRoundConstant */\ - asm ("vxorpd ymm6, ymm6, ymm6");\ - asm ("vinsertf128 ymm7, ymm6, [ROUND_CONST_Q+eax*8], 1");\ - asm ("vinsertf128 ymm6, ymm6, [ALL_FF], 1");\ - asm ("vinsertf128 ymm0, ymm6, [ROUND_CONST_P+eax*8], 0");\ - asm ("vxorpd ymm0, ymm8, ymm0");\ - asm ("vxorpd ymm1, ymm9, ymm6");\ - asm ("vxorpd ymm2, ymm10, ymm6");\ - asm ("vxorpd ymm3, ymm11, ymm6");\ - asm ("vxorpd ymm4, ymm12, ymm6");\ - asm ("vxorpd ymm5, ymm13, ymm6");\ - asm ("vxorpd ymm6, ymm14, ymm6");\ - asm ("vxorpd ymm7, ymm15, ymm7");\ - /* SubBytes + ShiftBytes + MixBytes */\ - SUBSHIFTMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - asm ("add al, 2");\ - asm ("mov rbx, rax");\ - asm ("sub bl, 28");\ - asm ("jb 1b");\ -} - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("vpshufb xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i5)", xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i7)", xmm"tostr(i7)", xmm"tostr(t0)"");\ -\ - /* continue with unpack */\ - asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm 
("vpunpckhwd xmm"tostr(t1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpckhwd xmm"tostr(t2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpckhwd xmm"tostr(t3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklwd xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklwd xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ -\ - /* shuffle with immediate */\ - asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - asm ("vpshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("vpshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ - asm ("vpshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("vpshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("vpshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ -\ - /* continue with unpack */\ - asm ("vpunpckhdq xmm"tostr(t4)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckhdq xmm"tostr(t5)", xmm"tostr(t0)", xmm"tostr(t1)"");\ - asm ("vpunpckldq xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t1)"");\ - asm ("vpunpckhdq xmm"tostr(t6)", xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("vpunpckldq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("vpunpckhdq xmm"tostr(t7)", xmm"tostr(t2)", xmm"tostr(t3)"");\ - asm ("vpunpckldq xmm"tostr(t2)", xmm"tostr(t2)", xmm"tostr(t3)"");\ -\ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(t0)", xmm"tostr(t2)"");\ - asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(t0)", xmm"tostr(t2)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(t4)", xmm"tostr(t6)"");\ - asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(t4)", xmm"tostr(t6)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(t5)", xmm"tostr(t7)"");\ - asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(t5)", xmm"tostr(t7)"");\ - /* transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - asm ("vmovaps xmm"tostr(o0)", [TRANSP_MASK]");\ - /* transpose matrix to get output format */\ - asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpckhqdq xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpckhqdq xmm"tostr(t1)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpckhqdq xmm"tostr(t2)", xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ - /* load transpose mask into a register, because it will be used 8 times */\ - asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(o0)"");\ - asm ("vpshufb 
xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(t1)", xmm"tostr(t1)", xmm"tostr(o0)"");\ - asm ("vpshufb xmm"tostr(t2)", xmm"tostr(t2)", xmm"tostr(o0)"");\ - /* continue with unpack */\ - asm ("vpunpckhwd xmm"tostr(t3)", xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("vpunpcklwd xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("vpunpckhwd xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckhwd xmm"tostr(o2)", xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("vpunpcklwd xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("vpunpckhwd xmm"tostr(t4)", xmm"tostr(t1)", xmm"tostr(t2)"");\ - asm ("vpunpcklwd xmm"tostr(t1)", xmm"tostr(t1)", xmm"tostr(t2)"");\ - /* shuffle with immediate */\ - asm ("vpshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("vpshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("vpshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ - asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("vpshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ - asm ("vpshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("vpshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ - /* continue with unpack */\ - asm ("vpunpckhdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpckhdq xmm"tostr(i3)", xmm"tostr(o0)", xmm"tostr(t3)"");\ - asm ("vpunpckldq xmm"tostr(o0)", xmm"tostr(o0)", xmm"tostr(t3)"");\ - asm ("vpunpckhdq xmm"tostr(i5)", xmm"tostr(o1)", xmm"tostr(t1)"");\ - asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t1)"");\ - asm ("vpunpckhdq xmm"tostr(i7)", xmm"tostr(o2)", xmm"tostr(t4)"");\ - asm ("vpunpckldq xmm"tostr(o2)", xmm"tostr(o2)", xmm"tostr(t4)"");\ - /* transpose done */\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm8 - xmm15 */ - asm ("vmovaps xmm8, [rdi+0*16]"); - asm ("vmovaps xmm9, [rdi+1*16]"); - asm ("vmovaps xmm10, [rdi+2*16]"); - asm ("vmovaps xmm11, [rdi+3*16]"); - asm ("vmovaps xmm12, [rdi+4*16]"); - asm ("vmovaps xmm13, [rdi+5*16]"); - asm ("vmovaps xmm14, [rdi+6*16]"); - asm ("vmovaps xmm15, [rdi+7*16]"); - - /* transform chaining value from column ordering into row ordering */ - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store transposed IV */ - asm ("vmovaps [rdi+0*16], xmm8"); - asm ("vmovaps [rdi+1*16], xmm9"); - asm ("vmovaps [rdi+2*16], xmm10"); - asm ("vmovaps [rdi+3*16], xmm11"); - asm ("vmovaps [rdi+4*16], xmm12"); - asm ("vmovaps [rdi+5*16], xmm13"); - asm ("vmovaps [rdi+6*16], xmm14"); - asm ("vmovaps [rdi+7*16], xmm15"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF1024(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm8...xmm15 (Q = message) */ - asm ("vmovaps xmm0, [rsi+0*16]"); - asm ("vmovaps xmm1, [rsi+1*16]"); - asm ("vmovaps xmm2, [rsi+2*16]"); - asm ("vmovaps xmm3, [rsi+3*16]"); - asm ("vmovaps xmm4, [rsi+4*16]"); - asm ("vmovaps xmm5, [rsi+5*16]"); - asm 
("vmovaps xmm6, [rsi+6*16]"); - asm ("vmovaps xmm7, [rsi+7*16]"); - - /* transform message M from column ordering into row ordering */ - Matrix_Transpose(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8...xmm15 */ - asm ("vpxor xmm8, xmm0, [rdi+0*16]"); - asm ("vpxor xmm9, xmm1, [rdi+1*16]"); - asm ("vpxor xmm10, xmm2, [rdi+2*16]"); - asm ("vpxor xmm11, xmm3, [rdi+3*16]"); - asm ("vpxor xmm12, xmm4, [rdi+4*16]"); - asm ("vpxor xmm13, xmm5, [rdi+5*16]"); - asm ("vpxor xmm14, xmm6, [rdi+6*16]"); - asm ("vpxor xmm15, xmm7, [rdi+7*16]"); - - /* generate AVX registers with Q in high and P in low 128 bits */ - asm ("vinsertf128 ymm8, ymm8, xmm0, 1"); - asm ("vinsertf128 ymm9, ymm9, xmm1, 1"); - asm ("vinsertf128 ymm10, ymm10, xmm2, 1"); - asm ("vinsertf128 ymm11, ymm11, xmm3, 1"); - asm ("vinsertf128 ymm12, ymm12, xmm4, 1"); - asm ("vinsertf128 ymm13, ymm13, xmm5, 1"); - asm ("vinsertf128 ymm14, ymm14, xmm6, 1"); - asm ("vinsertf128 ymm15, ymm15, xmm7, 1"); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* extract output of Q to xmm0...xmm7 */ - asm ("vextractf128 xmm0, ymm8, 1"); - asm ("vextractf128 xmm1, ymm9, 1"); - asm ("vextractf128 xmm2, ymm10, 1"); - asm ("vextractf128 xmm3, ymm11, 1"); - asm ("vextractf128 xmm4, ymm12, 1"); - asm ("vextractf128 xmm5, ymm13, 1"); - asm ("vextractf128 xmm6, ymm14, 1"); - asm ("vextractf128 xmm7, ymm15, 1"); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm8...xmm15 */ - asm ("vpxor xmm8, xmm8, xmm0"); - asm ("vpxor xmm9, xmm9, xmm1"); - asm ("vpxor xmm10, xmm10, xmm2"); - asm ("vpxor xmm11, xmm11, xmm3"); - asm ("vpxor xmm12, xmm12, xmm4"); - asm ("vpxor xmm13, xmm13, xmm5"); - asm ("vpxor xmm14, xmm14, xmm6"); - asm ("vpxor xmm15, xmm15, xmm7"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm8...xmm15 */ - asm ("vpxor xmm8, xmm8, [rdi+0*16]"); - asm ("vpxor xmm9, xmm9, [rdi+1*16]"); - asm ("vpxor xmm10, xmm10, [rdi+2*16]"); - asm ("vpxor xmm11, xmm11, [rdi+3*16]"); - asm ("vpxor xmm12, xmm12, [rdi+4*16]"); - asm ("vpxor xmm13, xmm13, [rdi+5*16]"); - asm ("vpxor xmm14, xmm14, [rdi+6*16]"); - asm ("vpxor xmm15, xmm15, [rdi+7*16]"); - - /* store CV */ - asm ("vmovaps [rdi+0*16], xmm8"); - asm ("vmovaps [rdi+1*16], xmm9"); - asm ("vmovaps [rdi+2*16], xmm10"); - asm ("vmovaps [rdi+3*16], xmm11"); - asm ("vmovaps [rdi+4*16], xmm12"); - asm ("vmovaps [rdi+5*16], xmm13"); - asm ("vmovaps [rdi+6*16], xmm14"); - asm ("vmovaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF1024(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - asm ("vpxor xmm0, xmm0, xmm0"); - - /* load CV into registers xmm8...xmm15 */ - asm ("vmovaps xmm8, [rdi+0*16]"); - asm ("vmovaps xmm9, [rdi+1*16]"); - asm ("vmovaps xmm10, [rdi+2*16]"); - asm ("vmovaps xmm11, [rdi+3*16]"); - asm ("vmovaps xmm12, [rdi+4*16]"); - asm ("vmovaps xmm13, [rdi+5*16]"); - asm ("vmovaps xmm14, [rdi+6*16]"); - asm ("vmovaps xmm15, [rdi+7*16]"); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8...xmm15 */ - ROUNDS_P_Q(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - asm ("vpxor xmm8, xmm8, [rdi+0*16]"); - asm ("vpxor xmm9, xmm9, 
[rdi+1*16]"); - asm ("vpxor xmm10, xmm10, [rdi+2*16]"); - asm ("vpxor xmm11, xmm11, [rdi+3*16]"); - asm ("vpxor xmm12, xmm12, [rdi+4*16]"); - asm ("vpxor xmm13, xmm13, [rdi+5*16]"); - asm ("vpxor xmm14, xmm14, [rdi+6*16]"); - asm ("vpxor xmm15, xmm15, [rdi+7*16]"); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); - - /* we only need to return the truncated half of the state */ - asm ("vmovaps [rdi+4*16], xmm0"); - asm ("vmovaps [rdi+5*16], xmm6"); - asm ("vmovaps [rdi+6*16], xmm13"); - asm ("vmovaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - diff --git a/algo/groestl/aes_ni/groestl-asm-vperm.h b/algo/groestl/aes_ni/groestl-asm-vperm.h deleted file mode 100644 index f8ae27c..0000000 --- a/algo/groestl/aes_ni/groestl-asm-vperm.h +++ /dev/null @@ -1,1397 +0,0 @@ -/* groestl-asm-vperm.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3 instructions. - * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz - * - * Based on the vperm and aes_ni implementations of the hash function Groestl - * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ - * Institute of Applied Mathematics, Middle East Technical University, Turkey - * - * This code is placed in the public domain - */ - -#include "hash-groestl.h" - -/* global constants */ -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (16))) unsigned char ALL_0F[16]; -__attribute__ ((aligned (16))) unsigned char ALL_15[16]; -__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; -__attribute__ ((aligned (16))) unsigned char ALL_63[16]; -__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; -__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16]; - -/* temporary variables */ -__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16]; -__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP[8*16]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_SHARED_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\ - ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\ - ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\ - ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\ - ((u64*)VPERM_IPT)[ 0] = 
0x4C01307D317C4D00ULL;\ - ((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\ - ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\ - ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\ - ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\ - ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\ - ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\ - ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\ - ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\ - ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\ - ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\ - ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\ - ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\ - ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\ - ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\ - ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\ - ((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\ - ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\ - ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\ - ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\ - ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\ - ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\ - ((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\ - ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\ -/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\ - ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\ - ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\ - ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\ - ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\ - ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\ -}/**/ - -/* VPERM - * Transform w/o settings c* - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\ - asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ - asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\ - asm ("psrld xmm"tostr(t0)", 4");\ - asm ("psrld xmm"tostr(t1)", 4");\ - asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ - asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\ - asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\ - asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\ - asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\ - asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\ -}/**/ - -#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ - asm ("movaps xmm"tostr(c0)", [ALL_0F]");\ - asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\ - asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\ -}/**/ - -/* VPERM - * Transform - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * 
Transform State - * inputs: - * a0-a3 = state - * table = transformation table to use - * t* = clobbers - * outputs: - * a0-a3 = transformed state - * */ -#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ - VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Add Constant to State - * inputs: - * a0-a7 = state - * constant = constant to add - * t0 = clobber - * outputs: - * a0-a7 = state + constant - * */ -#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ - asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * Set Substitute Core Constants - * */ -#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ - VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ -}/**/ - -/* VPERM - * Substitute Core - * first part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0 = 1 row - * t*, c* = clobbers - * outputs: - * b0a, b0b = inputs for lookup step - * */ -#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ - asm ("psrld xmm"tostr(t0)", 4");\ - asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\ - asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\ - asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\ - asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\ - asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\ - asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * Lookup - * second part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0a, a0b = output of Substitution Core - * table = lookup table to use (*1 / *2 / *4) - * t0 = clobber - * outputs: - * b0 = output of sbox + multiplication - * */ -#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ - asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\ - asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\ - asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * SubBytes and *2 / *4 - * this function is derived from: - * Constant-time SSSE3 AES core implementation - * by Mike Hamburg - * and - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0-a7 = state - * t*, c* = clobbers - * 
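
VPERM_Lookup above is the classic two-table pshufb evaluation: any byte map f that is linear over GF(2) satisfies f(x) = f(x & 0x0F) ^ f(x & 0xF0), so two 16-entry tables, one indexed by each nibble, reproduce it exactly. A sketch of the pattern, using GF(2^8) multiplication by 2 as the linear map (assumes SSSE3, -mssse3; mul2 and the tables are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>

static uint8_t mul2(uint8_t x) { return (uint8_t)((x << 1) ^ ((x & 0x80) ? 0x1b : 0)); }

int main(void)
{
    uint8_t lo[16], hi[16], in[16], out[16];
    for (int n = 0; n < 16; n++) {
        lo[n] = mul2((uint8_t)n);          /* f on the low nibble  */
        hi[n] = mul2((uint8_t)(n << 4));   /* f on the high nibble */
        in[n] = (uint8_t)(n * 31);
    }
    __m128i x  = _mm_loadu_si128((const __m128i *)in);
    __m128i xl = _mm_and_si128(x, _mm_set1_epi8(0x0f));
    __m128i xh = _mm_and_si128(_mm_srli_epi16(x, 4), _mm_set1_epi8(0x0f));
    __m128i r  = _mm_xor_si128(
        _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)lo), xl),
        _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)hi), xh));
    _mm_storeu_si128((__m128i *)out, r);
    for (int n = 0; n < 16; n++)
        printf("%02x -> %02x (expect %02x)\n", in[n], out[n], mul2(in[n]));
    return 0;
}
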
outputs: - * a0-a7 = state * 4 - * c2 = row0 * 2 -> b0 - * c1 = row7 * 2 -> b3 - * c0 = row7 * 1 -> b4 - * t2 = row4 * 1 -> b7 - * TEMP_MUL1 = row(i) * 1 - * TEMP_MUL2 = row(i) * 2 - * - * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ -#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ - /* set Constants */\ - VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ - /* row 1 */\ - VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ - /* --- */\ - /* row 2 */\ - VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ - /* --- */\ - /* row 3 */\ - VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ - /* --- */\ - /* row 5 */\ - VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ - /* --- */\ - /* row 6 */\ - VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ - /* --- */\ - /* row 7 */\ - VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ - VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ - /* --- */\ - /* row 4 */\ - VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ - /* --- */\ - /* row 0 */\ - VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ - asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ - /* --- */\ -}/**/ - - -/* Optimized MixBytes - * inputs: - * a0-a7 = (row0-row7) * 4 - * b0 = row0 * 2 - * b3 = row7 * 2 - * b4 = row7 * 1 - * b7 = row4 * 1 - * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 - * output: b0-b7 - * */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* save one value */\ - asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\ - /* 1 */\ - asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! 
*/\ - asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\ - asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\ - \ - /* 2 */\ - asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\ - asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\ - \ - /* 4 */\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\ - /*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\ - asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\ - \ - /* 3 */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\ - /*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\ - asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\ - \ - /* 5 */\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ - /*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\ - \ - /* 6 */\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ - \ - /* 7 */\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ - \ - /* 8 */\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ - \ - /* 9 */\ - asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* 10 */\ - asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\ - \ - /* 11 */\ - asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ - \ - /* 12 */\ - asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* 13 */\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ - asm 
("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ -}/**/ - -#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}/**/ - -#define Push_All_Regs(){\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}/**/ - -#define Pop_All_Regs(){\ -/* not using any... - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}/**/ - - -/* vperm: - * transformation before rounds with ipt - * first round add transformed constant - * middle rounds: add constant XOR 0x15...15 - * last round: additionally add 0x15...15 after MB - * transformation after rounds with opt - */ -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant + ShiftBytes (interleaved) */\ - asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - /* SubBytes + Multiplication by 2 and 4 */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - 
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ -} - - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ -\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ -\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ -\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ -\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ -\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ - asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm 
("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\ - asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\ - asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\ - VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("pxor xmm1, [ALL_15]");\ - asm ("pxor xmm2, [ALL_15]");\ - asm ("pxor xmm3, [ALL_15]");\ - asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\ - asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\ - asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\ - asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - asm ("movaps xmm0, [ROUND_CONST_Lx]");\ - VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("movaps [ROUND_CONST_Lx], xmm0");\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ -}/**/ - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* transform round constants into VPERM mode */ - 
VPERM_Transform_RoundConst(); - - /* load IV into registers xmm12 - xmm15 */ - asm ("movaps xmm12, [rdi+0*16]"); - asm ("movaps xmm13, [rdi+1*16]"); - asm ("movaps xmm14, [rdi+2*16]"); - asm ("movaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm12"); - asm ("movaps [rdi+1*16], xmm2"); - asm ("movaps [rdi+2*16], xmm6"); - asm ("movaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("movaps xmm12, [rsi+0*16]"); - asm ("movaps xmm13, [rsi+1*16]"); - asm ("movaps xmm14, [rsi+2*16]"); - asm ("movaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm0, [rdi+1*16]"); - asm ("movaps xmm4, [rdi+2*16]"); - asm ("movaps xmm5, [rdi+3*16]"); - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("pxor xmm8, xmm12"); - asm ("pxor xmm0, xmm2"); - asm ("pxor xmm4, xmm6"); - asm ("pxor xmm5, xmm7"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("pxor xmm0, xmm8"); - asm ("pxor xmm1, xmm10"); - asm ("pxor xmm2, xmm12"); - asm ("pxor xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("pxor xmm0, [rdi+0*16]"); - asm ("pxor xmm1, [rdi+1*16]"); - asm ("pxor xmm2, [rdi+2*16]"); - asm ("pxor xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm0"); - asm ("movaps [rdi+1*16], xmm1"); - asm ("movaps [rdi+2*16], xmm2"); - asm ("movaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm10, [rdi+1*16]"); - asm ("movaps xmm12, [rdi+2*16]"); - asm ("movaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register 
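
TF512 above (here in the vperm assembly being removed, but the structure is shared by every variant) is the Groestl-512 compression function: per its own comments it computes P(CV+M)+Q(M)+CV, with both permutations evaluated in parallel by ROUNDS_P_Q. A minimal scalar sketch of that structure; groestl_p512 and groestl_q512 are assumed stand-ins for the vectorized permutations, not functions in this codebase:

    #include <stdint.h>

    /* Assumed scalar P and Q permutations of the 512-bit state. */
    extern void groestl_p512( uint64_t out[8], const uint64_t in[8] );
    extern void groestl_q512( uint64_t out[8], const uint64_t in[8] );

    /* Sketch of the compression computed by TF512:
     * CV' = P(CV ^ M) ^ Q(M) ^ CV  (the final XOR is the feed-forward). */
    static void tf512_sketch( uint64_t cv[8], const uint64_t m[8] )
    {
        uint64_t p_in[8], p_out[8], q_out[8];
        for ( int i = 0; i < 8; i++ ) p_in[i] = cv[i] ^ m[i];
        groestl_p512( p_out, p_in );
        groestl_q512( q_out, m );
        for ( int i = 0; i < 8; i++ )
            cv[i] ^= p_out[i] ^ q_out[i];
    }
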
*/ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm10, [rdi+1*16]"); - asm ("pxor xmm12, [rdi+2*16]"); - asm ("pxor xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+2*16], xmm9"); - asm ("movaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - -#if (LENGTH > 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ - ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x0f0e0d0c0b0a0908ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0807060504030201ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x000f0e0d0c0b0a09ULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0908070605040302ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x01000f0e0d0c0b0aULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a09080706050403ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0201000f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0a090807060504ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x030201000f0e0d0cULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0c0b0a0908070605ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x04030201000f0e0dULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0d0c0b0a09080706ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x0504030201000f0eULL;\ - ((u64*)SUBSH_MASK)[14] = 0x0201000f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0a09080706050403ULL;\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ - ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ - ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - }\ -}/**/ - -#define Push_All_Regs(){\ - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");\ -}/**/ - -#define Pop_All_Regs(){\ - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");\ -}/**/ - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* SubBytes + Multiplication */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -#define ROUNDS_P(){\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("1:");\ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm8, [ROUND_CONST_P+eax*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm9, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm10, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm11, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm12, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm13, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm14, [SUBSH_MASK+6*16]");\ - asm 
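
OF512 above is the output transformation: it runs P alone over the chaining value, feeds the CV forward, and keeps only the half of the state that it stores back (the last 256 bits). The same logic as a scalar sketch, with the same assumed groestl_p512 stand-in as before:

    #include <stdint.h>

    extern void groestl_p512( uint64_t out[8], const uint64_t in[8] );

    /* Output transform sketch: hash = trunc_256( P(CV) ^ CV ). */
    static void of512_sketch( uint64_t hash[4], const uint64_t cv[8] )
    {
        uint64_t p_out[8];
        groestl_p512( p_out, cv );
        for ( int i = 0; i < 4; i++ )
            hash[i] = p_out[i+4] ^ cv[i+4];   /* last 256 bits */
    }
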
("pshufb xmm15, [SUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - VPERM_Add_Constant(0, 1, 2, 3, 4, 5, 6, 7, ALL_15, 8);\ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm0, [ROUND_CONST_P+ebx*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm1, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm2, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm3, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm4, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm5, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm6, [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm7, [SUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ - asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 1b");\ -}/**/ - -#define ROUNDS_Q(){\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 1);\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("2:");\ - /* AddRoundConstant Q1024 */\ - asm ("movaps xmm1, [ALL_FF]");\ - asm ("pxor xmm8, xmm1");\ - asm ("pxor xmm9, xmm1");\ - asm ("pxor xmm10, xmm1");\ - asm ("pxor xmm11, xmm1");\ - asm ("pxor xmm12, xmm1");\ - asm ("pxor xmm13, xmm1");\ - asm ("pxor xmm14, xmm1");\ - asm ("pxor xmm15, [ROUND_CONST_Q+eax*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm9, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm10, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm11, [SUBSH_MASK+7*16]");\ - asm ("pshufb xmm12, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm13, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm14, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm15, [SUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - /* AddRoundConstant Q1024 */\ - asm ("movaps xmm9, [ALL_FF]");\ - asm ("pxor xmm0, xmm9");\ - asm ("pxor xmm1, xmm9");\ - asm ("pxor xmm2, xmm9");\ - asm ("pxor xmm3, xmm9");\ - asm ("pxor xmm4, xmm9");\ - asm ("pxor xmm5, xmm9");\ - asm ("pxor xmm6, xmm9");\ - asm ("pxor xmm7, [ROUND_CONST_Q+ebx*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm1, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm2, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm3, [SUBSH_MASK+7*16]");\ - asm ("pshufb xmm4, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm5, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm6, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm7, [SUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 2b");\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 1);\ -}/**/ - - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i2)"");\ - asm ("pshufb xmm"tostr(i4)", 
xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("pshufb xmm"tostr(i7)", xmm"tostr(t0)"");\ -\ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i0)"");\ - asm ("punpckhwd xmm"tostr(t2)", xmm"tostr(i5)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i7)"");\ - asm ("punpcklwd xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(t1)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ -\ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ -\ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(i0)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(t4)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(t5)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t0)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(t5)", xmm"tostr(t1)"");\ - asm ("movdqa xmm"tostr(t6)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(t7)", xmm"tostr(t2)"");\ - asm ("punpckhdq xmm"tostr(t6)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t2)", xmm"tostr(t3)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(t7)", xmm"tostr(t3)"");\ -\ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i4)", xmm"tostr(t4)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(t4)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i6)", xmm"tostr(t5)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t6)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(t5)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t6)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t7)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t7)"");\ - /* transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - /* transpose matrix to get output format */\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(t0)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq 
xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(t1)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i6)"");\ - asm ("movaps xmm"tostr(o0)", [TRANSP_MASK]");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(t2)", xmm"tostr(i7)"");\ - /* load transpose mask into a register, because it will be used 8 times */\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i4)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(o1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(o0)"");\ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(t1)"");\ - \ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("punpckhwd xmm"tostr(o0)", xmm"tostr(i2)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhwd xmm"tostr(o2)", xmm"tostr(t0)"");\ - asm ("punpcklwd xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhwd xmm"tostr(t4)", xmm"tostr(t2)"");\ - asm ("punpcklwd xmm"tostr(t1)", xmm"tostr(t2)"");\ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(o0)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(o2)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(o0)", xmm"tostr(t3)"");\ - asm ("punpckhdq xmm"tostr(i3)", xmm"tostr(t3)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(i5)", xmm"tostr(t1)"");\ - asm ("punpckldq xmm"tostr(o2)", xmm"tostr(t4)"");\ - asm ("punpckhdq xmm"tostr(i7)", xmm"tostr(t4)"");\ - /* transpose done */\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - asm ("movaps xmm0, [ROUND_CONST_P+"tostr(i)"*16]");\ - asm ("movaps xmm1, [ROUND_CONST_P+"tostr(j)"*16]");\ - asm ("movaps xmm2, [ROUND_CONST_Q+"tostr(i)"*16]");\ - asm ("movaps xmm3, [ROUND_CONST_Q+"tostr(j)"*16]");\ - VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm2, [ALL_15]");\ - asm ("pxor xmm3, [ALL_15]");\ - asm ("movaps [ROUND_CONST_P+"tostr(i)"*16], xmm0");\ - asm ("movaps [ROUND_CONST_P+"tostr(j)"*16], xmm1");\ - asm ("movaps [ROUND_CONST_Q+"tostr(i)"*16], xmm2");\ - asm ("movaps [ROUND_CONST_Q+"tostr(j)"*16], xmm3");\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - 
VPERM_Transform_RoundConst_CNT2(8, 9);\ - VPERM_Transform_RoundConst_CNT2(10, 11);\ - VPERM_Transform_RoundConst_CNT2(12, 13);\ - asm ("movaps xmm0, [ALL_FF]");\ - VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("movaps [ALL_FF], xmm0");\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* transform chaining value from column ordering into row ordering */ - VPERM_Transform_State( 8, 9, 10, 11, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF1024(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm8 - xmm15 (Q = message) */ - asm ("movaps xmm8, [rsi+0*16]"); - asm ("movaps xmm9, [rsi+1*16]"); - asm ("movaps xmm10, [rsi+2*16]"); - asm ("movaps xmm11, [rsi+3*16]"); - asm ("movaps xmm12, [rsi+4*16]"); - asm ("movaps xmm13, [rsi+5*16]"); - asm ("movaps xmm14, [rsi+6*16]"); - asm ("movaps xmm15, [rsi+7*16]"); - - /* transform message M from column ordering into row ordering */ - VPERM_Transform_State( 8, 9, 10, 11, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store message M (Q input) for later */ - asm ("movaps [QTEMP+0*16], xmm8"); - asm ("movaps [QTEMP+1*16], xmm9"); - asm ("movaps [QTEMP+2*16], xmm10"); - asm ("movaps [QTEMP+3*16], xmm11"); - asm ("movaps [QTEMP+4*16], xmm12"); - asm ("movaps [QTEMP+5*16], xmm13"); - asm ("movaps [QTEMP+6*16], xmm14"); - asm ("movaps [QTEMP+7*16], xmm15"); - - /* xor CV to message to get P input */ - /* result: CV+M in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV+M) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV+M)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* 
store P(CV+M)+CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - /* load message M (Q input) into xmm8-15 */ - asm ("movaps xmm8, [QTEMP+0*16]"); - asm ("movaps xmm9, [QTEMP+1*16]"); - asm ("movaps xmm10, [QTEMP+2*16]"); - asm ("movaps xmm11, [QTEMP+3*16]"); - asm ("movaps xmm12, [QTEMP+4*16]"); - asm ("movaps xmm13, [QTEMP+5*16]"); - asm ("movaps xmm14, [QTEMP+6*16]"); - asm ("movaps xmm15, [QTEMP+7*16]"); - - /* compute permutation Q */ - /* result: Q(M) in xmm8...xmm15 */ - ROUNDS_Q(); - - /* xor Q output */ - /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF1024(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); - VPERM_Transform_State( 0, 6, 13, 15, VPERM_OPT, 1, 2, 3, 5, 7, 10, 12); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+4*16], xmm0"); - asm ("movaps [rdi+5*16], xmm6"); - asm ("movaps [rdi+6*16], xmm13"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index 10092da..3c3e740 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -11,17 +11,6 @@ #include #include "hash-groestl.h" -/* global constants */ -__m128i ROUND_CONST_Lx; -//__m128i ROUND_CONST_L0[ROUNDS512]; -//__m128i ROUND_CONST_L7[ROUNDS512]; -__m128i ROUND_CONST_P[ROUNDS1024]; -__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_1B; -__m128i ALL_FF; - #define tos(a) #a #define 
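
The hunk above removes the file-scope __m128i definitions (ROUND_CONST_*, TRANSP_MASK, SUBSH_MASK, ALL_1B, ALL_FF) from the header. Because the header is included from more than one translation unit, every inclusion emitted another tentative definition of the same symbols; GCC 10 defaults to -fno-common, which turns those duplicates into multiple-definition link errors. The patch sidesteps the issue by baking the values into static const tables and inline m128_const_64() constants instead. The general pattern, sketched with a hypothetical symbol name:

    #include <immintrin.h>
    #include <stdint.h>

    /* In a shared header, a plain definition such as
     *     __m128i round_const_x;        // one copy per includer,
     *                                   // fails to link with -fno-common
     * must become a declaration ... */
    extern __m128i round_const_x;

    /* ... with the single definition living in exactly one .c file,
     * or (as this patch does) become a file-local constant: */
    static const uint64_t round_const_x_tab[2] = { 0x0, 0x0 };
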
tostr(a) tos(a) @@ -111,7 +100,7 @@ __m128i ALL_FF; \ /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ /* compute w_i : add y_{i+4} */\ - b1 = ALL_1B;\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ MUL2(a0, b0, b1);\ a0 = _mm_xor_si128(a0, TEMP0);\ MUL2(a1, b0, b1);\ @@ -152,24 +141,41 @@ __m128i ALL_FF; }/*MixBytes*/ -#define SET_CONSTANTS(){\ - ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ - SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ - SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ - SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ - }\ -}while(0);\ +static const uint64_t round_const_p[] __attribute__ ((aligned (64))) = +{ + 0x7060504030201000, 0xf0e0d0c0b0a09080, + 0x7161514131211101, 0xf1e1d1c1b1a19181, + 0x7262524232221202, 0xf2e2d2c2b2a29282, + 0x7363534333231303, 0xf3e3d3c3b3a39383, + 0x7464544434241404, 0xf4e4d4c4b4a49484, + 0x7565554535251505, 0xf5e5d5c5b5a59585, + 0x7666564636261606, 0xf6e6d6c6b6a69686, + 0x7767574737271707, 0xf7e7d7c7b7a79787, + 0x7868584838281808, 0xf8e8d8c8b8a89888, + 0x7969594939291909, 0xf9e9d9c9b9a99989, + 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a, + 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b, + 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c, + 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d +}; + +static const uint64_t round_const_q[] __attribute__ ((aligned (64))) = +{ + 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f, + 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e, + 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d, + 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c, + 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b, + 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a, + 0x8999a9b9c9d9e9f9, 0x0919293949596979, + 0x8898a8b8c8d8e8f8, 0x0818283848586878, + 0x8797a7b7c7d7e7f7, 0x0717273747576777, + 0x8696a6b6c6d6e6f6, 0x0616263646566676, + 0x8595a5b5c5d5e5f5, 0x0515253545556575, + 0x8494a4b4c4d4e4f4, 0x0414243444546474, + 0x8393a3b3c3d3e3f3, 0x0313233343536373, + 0x8292a2b2c2d2e2f2, 0x0212223242526272 +}; /* one round * a0-a7 = input rows @@ -194,30 +200,50 @@ __m128i ALL_FF; u8 round_counter = 0;\ for(round_counter = 0; round_counter < 14; round_counter+=2) {\ /* AddRoundConstant P1024 */\ - xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ + xmm8 = _mm_xor_si128( xmm8, \ + casti_m128i( round_const_p, round_counter ) ); \ /* ShiftBytes P1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ - xmm12 = 
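
MixBytes, used above with the 0x1b1b... constant fed to MUL2, multiplies each column of the state by the Groestl specification's circulant matrix circ(02,02,03,04,05,03,05,07) over GF(2^8) with reduction polynomial 0x11b; the byte-sliced formulation in these files only rearranges that arithmetic for SIMD. A scalar reference against which the vector code can be checked; xtime, gf_mul and mixbytes_column are illustrative helpers:

    #include <stdint.h>

    /* Double one byte in GF(2^8) mod x^8 + x^4 + x^3 + x + 1. */
    static uint8_t xtime( uint8_t x )
    {
        return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
    }

    /* Multiply by a small constant via shift-and-add. */
    static uint8_t gf_mul( uint8_t x, uint8_t c )
    {
        uint8_t r = 0;
        for ( ; c; c >>= 1 )
        {
            if ( c & 1 ) r ^= x;
            x = xtime( x );
        }
        return r;
    }

    /* Reference MixBytes on one 8-byte column:
     * b_i = sum over j of circ[j] * a_{(i+j) mod 8}. */
    static void mixbytes_column( uint8_t b[8], const uint8_t a[8] )
    {
        static const uint8_t circ[8] = { 2, 2, 3, 4, 5, 3, 5, 7 };
        for ( int i = 0; i < 8; i++ )
        {
            b[i] = 0;
            for ( int j = 0; j < 8; j++ )
                b[i] ^= gf_mul( a[ (i + j) & 7 ], circ[j] );
        }
    }
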
_mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ + xmm8 = _mm_shuffle_epi8( xmm8, m128_const_64( 0x0306090c0f020508, \ + 0x0b0e0104070a0d00 ) ); \ + xmm9 = _mm_shuffle_epi8( xmm9, m128_const_64( 0x04070a0d00030609, \ + 0x0c0f0205080b0e01 ) ); \ + xmm10 = _mm_shuffle_epi8( xmm10, m128_const_64( 0x05080b0e0104070a, \ + 0x0d000306090c0f02 ) ); \ + xmm11 = _mm_shuffle_epi8( xmm11, m128_const_64( 0x06090c0f0205080b, \ + 0x0e0104070a0d0003 ) ); \ + xmm12 = _mm_shuffle_epi8( xmm12, m128_const_64( 0x070a0d000306090c, \ + 0x0f0205080b0e0104 ) ); \ + xmm13 = _mm_shuffle_epi8( xmm13, m128_const_64( 0x080b0e0104070a0d, \ + 0x000306090c0f0205 ) ); \ + xmm14 = _mm_shuffle_epi8( xmm14, m128_const_64( 0x090c0f0205080b0e, \ + 0x0104070a0d000306 ) ); \ + xmm15 = _mm_shuffle_epi8( xmm15, m128_const_64( 0x0e0104070a0d0003, \ + 0x06090c0f0205080b ) ); \ /* SubBytes + MixBytes */\ - SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \ + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \ \ /* AddRoundConstant P1024 */\ - xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ - xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ - xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ - SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + xmm0 = _mm_xor_si128( xmm0, \ + casti_m128i( round_const_p, round_counter+1 ) ); \ + xmm0 = _mm_shuffle_epi8( xmm0, m128_const_64( 0x0306090c0f020508, \ + 0x0b0e0104070a0d00 ) ); \ + xmm1 = _mm_shuffle_epi8( xmm1, m128_const_64( 0x04070a0d00030609, \ + 0x0c0f0205080b0e01 ) ); \ + xmm2 = _mm_shuffle_epi8( xmm2, m128_const_64( 0x05080b0e0104070a, \ + 0x0d000306090c0f02 ) ); \ + xmm3 = _mm_shuffle_epi8( xmm3, m128_const_64( 0x06090c0f0205080b, \ + 0x0e0104070a0d0003 ) ); \ + xmm4 = _mm_shuffle_epi8( xmm4, m128_const_64( 0x070a0d000306090c, \ + 0x0f0205080b0e0104 ) ); \ + xmm5 = _mm_shuffle_epi8( xmm5, m128_const_64( 0x080b0e0104070a0d, \ + 0x000306090c0f0205 ) ); \ + xmm6 = _mm_shuffle_epi8( xmm6, m128_const_64( 0x090c0f0205080b0e, \ + 0x0104070a0d000306 ) ); \ + xmm7 = _mm_shuffle_epi8( xmm7, m128_const_64( 0x0e0104070a0d0003, \ + 0x06090c0f0205080b ) ); \ + SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \ + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \ }\ } @@ -225,48 +251,68 @@ __m128i ALL_FF; u8 round_counter = 0;\ for(round_counter = 0; round_counter < 14; round_counter+=2) {\ /* AddRoundConstant Q1024 */\ - xmm1 = ALL_FF;\ - xmm8 = _mm_xor_si128(xmm8, xmm1);\ - xmm9 = _mm_xor_si128(xmm9, xmm1);\ - xmm10 = _mm_xor_si128(xmm10, xmm1);\ - xmm11 = _mm_xor_si128(xmm11, xmm1);\ - xmm12 = _mm_xor_si128(xmm12, xmm1);\ - xmm13 = _mm_xor_si128(xmm13, xmm1);\ - xmm14 = _mm_xor_si128(xmm14, xmm1);\ - xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ + xmm1 = m128_neg1;\ + xmm8 = _mm_xor_si128( xmm8, xmm1 ); \ + xmm9 = _mm_xor_si128( xmm9, xmm1 ); \ + xmm10 = _mm_xor_si128( xmm10, xmm1 ); \ + xmm11 = _mm_xor_si128( xmm11, 
xmm1 ); \ + xmm12 = _mm_xor_si128( xmm12, xmm1 ); \ + xmm13 = _mm_xor_si128( xmm13, xmm1 ); \ + xmm14 = _mm_xor_si128( xmm14, xmm1 ); \ + xmm15 = _mm_xor_si128( xmm15, \ + casti_m128i( round_const_q, round_counter ) ); \ /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ - xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ + xmm8 = _mm_shuffle_epi8( xmm8, m128_const_64( 0x04070a0d00030609, \ + 0x0c0f0205080b0e01 ) ); \ + xmm9 = _mm_shuffle_epi8( xmm9, m128_const_64( 0x06090c0f0205080b, \ + 0x0e0104070a0d0003 ) ); \ + xmm10 = _mm_shuffle_epi8( xmm10, m128_const_64( 0x080b0e0104070a0d, \ + 0x000306090c0f0205 ) ); \ + xmm11 = _mm_shuffle_epi8( xmm11, m128_const_64( 0x0e0104070a0d0003, \ + 0x06090c0f0205080b ) ); \ + xmm12 = _mm_shuffle_epi8( xmm12, m128_const_64( 0x0306090c0f020508, \ + 0x0b0e0104070a0d00 ) ); \ + xmm13 = _mm_shuffle_epi8( xmm13, m128_const_64( 0x05080b0e0104070a, \ + 0x0d000306090c0f02 ) ); \ + xmm14 = _mm_shuffle_epi8( xmm14, m128_const_64( 0x070a0d000306090c, \ + 0x0f0205080b0e0104 ) ); \ + xmm15 = _mm_shuffle_epi8( xmm15, m128_const_64( 0x090c0f0205080b0e, \ + 0x0104070a0d000306 ) ); \ /* SubBytes + MixBytes */\ - SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \ + xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 , xmm7 ); \ \ /* AddRoundConstant Q1024 */\ - xmm9 = ALL_FF;\ - xmm0 = _mm_xor_si128(xmm0, xmm9);\ - xmm1 = _mm_xor_si128(xmm1, xmm9);\ - xmm2 = _mm_xor_si128(xmm2, xmm9);\ - xmm3 = _mm_xor_si128(xmm3, xmm9);\ - xmm4 = _mm_xor_si128(xmm4, xmm9);\ - xmm5 = _mm_xor_si128(xmm5, xmm9);\ - xmm6 = _mm_xor_si128(xmm6, xmm9);\ - xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ + xmm9 = m128_neg1;\ + xmm0 = _mm_xor_si128( xmm0, xmm9 ); \ + xmm1 = _mm_xor_si128( xmm1, xmm9 ); \ + xmm2 = _mm_xor_si128( xmm2, xmm9 ); \ + xmm3 = _mm_xor_si128( xmm3, xmm9 ); \ + xmm4 = _mm_xor_si128( xmm4, xmm9 ); \ + xmm5 = _mm_xor_si128( xmm5, xmm9 ); \ + xmm6 = _mm_xor_si128( xmm6, xmm9 ); \ + xmm7 = _mm_xor_si128( xmm7, \ + casti_m128i( round_const_q, round_counter+1 ) ); \ /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ - xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ + xmm0 = _mm_shuffle_epi8( xmm0, m128_const_64( 0x04070a0d00030609, \ + 0x0c0f0205080b0e01 ) ); \ + xmm1 = _mm_shuffle_epi8( xmm1, m128_const_64( 0x06090c0f0205080b, \ + 0x0e0104070a0d0003 ) ); \ + xmm2 = _mm_shuffle_epi8( xmm2, m128_const_64( 0x080b0e0104070a0d, \ + 0x000306090c0f0205 ) ); \ + xmm3 = _mm_shuffle_epi8( xmm3, m128_const_64( 0x0e0104070a0d0003, \ + 0x06090c0f0205080b ) ); \ + xmm4 = _mm_shuffle_epi8( xmm4, m128_const_64( 0x0306090c0f020508, \ + 0x0b0e0104070a0d00 ) ); \ + xmm5 = _mm_shuffle_epi8( xmm5, m128_const_64( 0x05080b0e0104070a, \ + 0x0d000306090c0f02 ) ); \ + xmm6 = 
_mm_shuffle_epi8( xmm6, m128_const_64( 0x070a0d000306090c, \ + 0x0f0205080b0e0104 ) ); \ + xmm7 = _mm_shuffle_epi8( xmm7, m128_const_64( 0x090c0f0205080b0e, \ + 0x0104070a0d000306 ) ); \ /* SubBytes + MixBytes */\ - SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \ + xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \ }\ } @@ -278,7 +324,7 @@ __m128i ALL_FF; * clobbers: t0-t7 */ #define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - t0 = TRANSP_MASK;\ + t0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 );\ \ i6 = _mm_shuffle_epi8(i6, t0);\ i0 = _mm_shuffle_epi8(i0, t0);\ @@ -366,7 +412,7 @@ __m128i ALL_FF; i4 = _mm_unpacklo_epi64(i4, i5);\ t1 = _mm_unpackhi_epi64(t1, i5);\ t2 = i6;\ - o0 = TRANSP_MASK;\ + o0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 ); \ i6 = _mm_unpacklo_epi64(i6, i7);\ t2 = _mm_unpackhi_epi64(t2, i7);\ /* load transpose mask into a register, because it will be used 8 times */\ diff --git a/algo/groestl/aes_ni/groestl-intr-avx.h b/algo/groestl/aes_ni/groestl-intr-avx.h deleted file mode 100644 index 97f08dd..0000000 --- a/algo/groestl/aes_ni/groestl-intr-avx.h +++ /dev/null @@ -1,1072 +0,0 @@ -/* groestl-intr-avx.h Aug 2011 - * - * Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx - * instructions. - * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz - * - * This code is placed in the public domain - */ - -#include -#include -#include -#include "hash-groestl.h" - -/* global constants */ -__m128i ROUND_CONST_Lx; -__m128i ROUND_CONST_L0[ROUNDS512]; -__m128i ROUND_CONST_L7[ROUNDS512]; -__m128i ROUND_CONST_P[ROUNDS1024]; -__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_FF; -#if LENGTH <= 256 -__m128i ALL_1B; -#else -__m256d ALL_1B; -#endif - -#define tos(a) #a -#define tostr(a) tos(a) - -#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos))) -#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos)) - -#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ - SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ - SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ - SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ - SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ - }\ - ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ 
-}while(0); - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2(i, j, k, z){\ - j = _mm_cmpgt_epi8(z, i);\ - i = _mm_add_epi8(i, i);\ - j = _mm_and_si128(j, k);\ - i = _mm_xor_si128(i, j);\ -}/**/ - -/* Yet another implementation of MixBytes. - This time we use the formulae (3) from the paper "Byte Slicing Groestl". - Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). - but we use the relations: - t_i = a_i + a_{i+3} - x_i = t_i + t_{i+3} - y_i = t_i + t+{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ - b0 = a2;\ - b1 = a3;\ - b2 = a4;\ - b3 = a5;\ - b4 = a6;\ - b5 = a7;\ - b6 = a0;\ - b7 = a1;\ - \ - /* t_i = a_i + a_{i+1} */\ - a0 = _mm_xor_si128(a0, a1);\ - a1 = _mm_xor_si128(a1, a2);\ - a2 = _mm_xor_si128(a2, a3);\ - a3 = _mm_xor_si128(a3, a4);\ - a4 = _mm_xor_si128(a4, a5);\ - a5 = _mm_xor_si128(a5, a6);\ - a6 = _mm_xor_si128(a6, a7);\ - a7 = _mm_xor_si128(a7, b6);\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - b0 = _mm_xor_si128(b0, a4);\ - b1 = _mm_xor_si128(b1, a5);\ - b2 = _mm_xor_si128(b2, a6);\ - b3 = _mm_xor_si128(b3, a7);\ - b4 = _mm_xor_si128(b4, a0);\ - b5 = _mm_xor_si128(b5, a1);\ - b6 = _mm_xor_si128(b6, a2);\ - b7 = _mm_xor_si128(b7, a3);\ - \ - b0 = _mm_xor_si128(b0, a6);\ - b1 = _mm_xor_si128(b1, a7);\ - b2 = _mm_xor_si128(b2, a0);\ - b3 = _mm_xor_si128(b3, a1);\ - b4 = _mm_xor_si128(b4, a2);\ - b5 = _mm_xor_si128(b5, a3);\ - b6 = _mm_xor_si128(b6, a4);\ - b7 = _mm_xor_si128(b7, a5);\ - \ - /* spill values y_4, y_5 to memory */\ - TEMP0 = b0;\ - TEMP1 = b1;\ - TEMP2 = b2;\ - \ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - b0 = a0;\ - b1 = a1;\ - TEMP3 = a2;\ - \ - /* compute x_i = t_i + t_{i+3} */\ - a0 = _mm_xor_si128(a0, a3);\ - a1 = _mm_xor_si128(a1, a4);\ - a2 = _mm_xor_si128(a2, a5);\ - a3 = _mm_xor_si128(a3, a6);\ - a4 = _mm_xor_si128(a4, a7);\ - a5 = _mm_xor_si128(a5, b0);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, TEMP3);\ - \ - /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - b1 = ALL_1B;\ - b2 = _mm_xor_si128(b2, b2);\ - VMUL2(a7, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a0, b0, b1, b2);\ - \ - /* compute w_i : add y_{i+4} */\ - a0 = _mm_xor_si128(a0, TEMP0);\ - a1 = _mm_xor_si128(a1, TEMP1);\ - a2 = _mm_xor_si128(a2, TEMP2);\ - a3 = _mm_xor_si128(a3, b3);\ - a4 = _mm_xor_si128(a4, b4);\ - a5 = _mm_xor_si128(a5, b5);\ - a6 = _mm_xor_si128(a6, b6);\ - a7 = _mm_xor_si128(a7, b7);\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a7, b0, b1, b2);\ - \ - /* add to y_4 y_5 .. v3, v4, ... 
*/\ - b0 = _mm_xor_si128(a3, TEMP0);\ - b1 = _mm_xor_si128(a4, TEMP1);\ - b2 = _mm_xor_si128(a5, TEMP2);\ - b3 = _mm_xor_si128(b3, a6);\ - b4 = _mm_xor_si128(b4, a7);\ - b5 = _mm_xor_si128(b5, a0);\ - b6 = _mm_xor_si128(b6, a1);\ - b7 = _mm_xor_si128(b7, a2);\ -}/*MixBytes*/ - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* Add Round Constant */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a4 = _mm_xor_si128(a4, b1);\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ - \ - /* ShiftBytes + SubBytes (interleaved) */\ - b0 = _mm_xor_si128(b0, b0);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a0 = _mm_aesenclast_si128(a0, b0);\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a1 = _mm_aesenclast_si128(a1, b0);\ - a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ - a2 = _mm_aesenclast_si128(a2, b0);\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a3 = _mm_aesenclast_si128(a3, b0);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a4 = _mm_aesenclast_si128(a4, b0);\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a5 = _mm_aesenclast_si128(a5, b0);\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a6 = _mm_aesenclast_si128(a6, b0);\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - a7 = _mm_aesenclast_si128(a7, b0);\ - \ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ - \ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ - \ - o1 = _mm_unpackhi_epi16(i0, i1);\ - i0 = _mm_unpacklo_epi16(i0, i1);\ - t0 = _mm_unpackhi_epi16(i2, i3);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - \ - i0 = _mm_shuffle_epi32(i0, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - t0 
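
VMUL2 above doubles every byte of a register without branching: _mm_cmpgt_epi8 against zero produces 0xff exactly in the bytes whose top bit is set, _mm_add_epi8 performs the left shift, and the masked XOR with the 0x1b bytes applies the GF(2^8) reduction only where the shift overflowed. The same trick on a single byte, as an illustrative sketch:

    #include <stdint.h>

    /* Branchless GF(2^8) doubling, mirroring VMUL2's
     * cmpgt / add / and / xor sequence. */
    static uint8_t xtime_branchless( uint8_t x )
    {
        uint8_t mask = (uint8_t)( -(int)( x >> 7 ) );  /* 0xff iff MSB set */
        return (uint8_t)( ( x << 1 ) ^ ( mask & 0x1b ) );
    }
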
= _mm_shuffle_epi32(t0, 216);\ - \ - o2 = _mm_unpackhi_epi32(i0, i2);\ - o3 = _mm_unpackhi_epi32(o1, t0);\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - o1 = _mm_unpacklo_epi32(o1, t0);\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - o1 = _mm_unpackhi_epi64(i0, i4);\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - o2 = _mm_unpacklo_epi64(i1, i5);\ - o3 = _mm_unpackhi_epi64(i1, i5);\ - o4 = _mm_unpacklo_epi64(i2, i6);\ - o5 = _mm_unpackhi_epi64(i2, i6);\ - o6 = _mm_unpacklo_epi64(i3, i7);\ - o7 = _mm_unpackhi_epi64(i3, i7);\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - o0 = _mm_unpackhi_epi64(i0, i1);\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o1 = _mm_unpackhi_epi64(i2, i3);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - o2 = _mm_unpackhi_epi64(i4, i5);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - o3 = _mm_unpackhi_epi64(i6, i7);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - t0 = _mm_xor_si128(t0, t0);\ - i1 = _mm_unpackhi_epi64(i0, t0);\ - i0 = _mm_unpacklo_epi64(i0, t0);\ - i3 = _mm_unpackhi_epi64(i2, t0);\ - i2 = _mm_unpacklo_epi64(i2, t0);\ - i5 = _mm_unpackhi_epi64(i4, t0);\ - i4 = _mm_unpacklo_epi64(i4, t0);\ - i7 = _mm_unpackhi_epi64(i6, t0);\ - i6 = _mm_unpacklo_epi64(i6, t0);\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - - -void INIT(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; - static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; - - /* load IV into registers xmm12 - xmm15 */ - xmm12 = chaining[0]; - xmm13 = chaining[1]; - xmm14 = chaining[2]; - xmm15 = chaining[3]; - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* store transposed IV */ - chaining[0] = xmm12; - chaining[1] = xmm2; - chaining[2] = xmm6; - chaining[3] = xmm7; -} - -void TF512(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP0; - static __m128i TEMP1; - static __m128i TEMP2; - static __m128i TEMP3; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm12 - xmm15 */ - xmm12 = message[0]; - xmm13 = message[1]; - xmm14 = message[2]; - xmm15 = message[3]; - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - xmm8 = _mm_xor_si128(xmm12, chaining[0]); - xmm0 = _mm_xor_si128(xmm2, chaining[1]); - xmm4 = _mm_xor_si128(xmm6, chaining[2]); - xmm5 = _mm_xor_si128(xmm7, chaining[3]); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm10); - xmm2 = _mm_xor_si128(xmm2, xmm12); - xmm3 = _mm_xor_si128(xmm3, xmm14); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, chaining[0]); - xmm1 = _mm_xor_si128(xmm1, chaining[1]); - xmm2 = _mm_xor_si128(xmm2, chaining[2]); - xmm3 = _mm_xor_si128(xmm3, chaining[3]); - - /* store CV */ - chaining[0] = xmm0; - chaining[1] = xmm1; - chaining[2] = xmm2; - chaining[3] = xmm3; - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP0; - static __m128i TEMP1; - static __m128i TEMP2; - static __m128i TEMP3; - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - xmm8 = chaining[0]; - xmm10 = chaining[1]; - xmm12 = chaining[2]; - xmm14 = chaining[3]; - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm10 = _mm_xor_si128(xmm10, (chaining[1])); - xmm12 = _mm_xor_si128(xmm12, (chaining[2])); - xmm14 = _mm_xor_si128(xmm14, (chaining[3])); - - /* transform state back from row ordering into column ordering */ - /* result: final 
hash value in xmm9, xmm11 */ - Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); - - /* we only need to return the truncated half of the state */ - chaining[2] = xmm9; - chaining[3] = xmm11; -} - -#endif - -#if (LENGTH > 256) - -#define SET_CONSTANTS(){\ - __m128i xmm0, xmm1;\ - __m256d ymm0;\ - xmm0 = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - xmm1 = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - ymm0 = insert_m128i_in_m256d(ymm0, xmm0, 0);\ - ymm0 = insert_m128i_in_m256d(ymm0, xmm1, 1);\ - ALL_1B = ymm0;\ - ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\ - SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\ - SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\ - SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ - }\ -}while(0); - -/* AVX MUL2 - * input: i - * output i = 2 * i - * */ -#define VMUL2(i){\ - xmmZERO = _mm_xor_si128(xmmZERO, xmmZERO);\ - xmmIL = extract_m128i_from_m256d(i, 0);\ - xmmIH = extract_m128i_from_m256d(i, 1);\ - xmmJL = _mm_cmpgt_epi8(xmmZERO, xmmIL);\ - xmmJH = _mm_cmpgt_epi8(xmmZERO, xmmIH);\ - xmmIL = _mm_add_epi8(xmmIL, xmmIL);\ - xmmIH = _mm_add_epi8(xmmIH, xmmIH);\ - ymmJ = insert_m128i_in_m256d(ymmJ, xmmJL, 0);\ - ymmJ = insert_m128i_in_m256d(ymmJ, xmmJH, 1);\ - ymmJ = _mm256_and_pd(ymmJ, ALL_1B);\ - i = insert_m128i_in_m256d(i, xmmIL, 0);\ - i = insert_m128i_in_m256d(i, xmmIH, 1);\ - i = _mm256_xor_pd(i, ymmJ);\ -}/**/ - -/* AVX SubShift - * inputs: - * * i - * * c0 (must be 0) - * * ShiftP - * * ShiftQ - * output i = S(Shift(i_1, ShiftQ)|Shift(i_0, ShiftP)) - * clobbers: t0 - * */ -#define SubShift(i, ShiftP, ShiftQ){\ - xmmZERO = _mm_xor_si128(xmmZERO, xmmZERO);\ - xmmIL = extract_m128i_from_m256d(i, 0);\ - xmmIH = extract_m128i_from_m256d(i, 1);\ - xmmIL = _mm_shuffle_epi8(xmmIL, SUBSH_MASK[ShiftP]);\ - xmmIH = _mm_shuffle_epi8(xmmIH, SUBSH_MASK[ShiftQ]);\ - xmmIL = _mm_aesenclast_si128(xmmIL, xmmZERO);\ - xmmIH = _mm_aesenclast_si128(xmmIH, xmmZERO);\ - i = insert_m128i_in_m256d(i, xmmIL, 0);\ - i = insert_m128i_in_m256d(i, xmmIH, 1);\ -}/**/ - -/* Yet another implementation of MixBytes. - This time we use the formulae (3) from the paper "Byte Slicing Groestl". - Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). 
- but we use the relations: - t_i = a_i + a_{i+1} - x_i = t_i + t_{i+3} - y_i = t_i + t_{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* xmm8..xmm15 = a2 a3 ... a0 a1 */\ - b0 = a2;\ - b1 = a3;\ - b2 = a4;\ - b3 = a5;\ - b4 = a6;\ - b5 = a7;\ - b6 = a0;\ - b7 = a1;\ - \ - /* t_i = a_i + a_{i+1} */\ - a0 = _mm256_xor_pd(a0, a1);\ - a1 = _mm256_xor_pd(a1, a2);\ - a2 = _mm256_xor_pd(a2, a3);\ - a3 = _mm256_xor_pd(a3, a4);\ - a4 = _mm256_xor_pd(a4, a5);\ - a5 = _mm256_xor_pd(a5, a6);\ - a6 = _mm256_xor_pd(a6, a7);\ - a7 = _mm256_xor_pd(a7, b6);\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - b0 = _mm256_xor_pd(b0, a4);\ - b1 = _mm256_xor_pd(b1, a5);\ - b2 = _mm256_xor_pd(b2, a6);\ - b3 = _mm256_xor_pd(b3, a7);\ - b4 = _mm256_xor_pd(b4, a0);\ - b5 = _mm256_xor_pd(b5, a1);\ - b6 = _mm256_xor_pd(b6, a2);\ - b7 = _mm256_xor_pd(b7, a3);\ - \ - b0 = _mm256_xor_pd(b0, a6);\ - b1 = _mm256_xor_pd(b1, a7);\ - b2 = _mm256_xor_pd(b2, a0);\ - b3 = _mm256_xor_pd(b3, a1);\ - b4 = _mm256_xor_pd(b4, a2);\ - b5 = _mm256_xor_pd(b5, a3);\ - b6 = _mm256_xor_pd(b6, a4);\ - b7 = _mm256_xor_pd(b7, a5);\ - \ - /* spill values y_4, y_5 to memory */\ - TEMP0 = b0;\ - TEMP1 = b1;\ - TEMP2 = b2;\ - \ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - b0 = a0;\ - b1 = a1;\ - TEMP3 = a2;\ - \ - /* compute x_i = t_i + t_{i+3} */\ - a0 = _mm256_xor_pd(a0, a3);\ - a1 = _mm256_xor_pd(a1, a4);\ - a2 = _mm256_xor_pd(a2, a5);\ - a3 = _mm256_xor_pd(a3, a6);\ - a4 = _mm256_xor_pd(a4, a7);\ - a5 = _mm256_xor_pd(a5, b0);\ - a6 = _mm256_xor_pd(a6, b1);\ - a7 = _mm256_xor_pd(a7, TEMP3);\ - \ - /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - b1 = ALL_1B;\ - b2 = _mm256_xor_pd(b2, b2);\ - VMUL2(a7);\ - VMUL2(a6);\ - VMUL2(a5);\ - VMUL2(a4);\ - VMUL2(a3);\ - VMUL2(a2);\ - VMUL2(a1);\ - VMUL2(a0);\ - \ - /* compute w_i : add y_{i+4} */\ - a0 = _mm256_xor_pd(a0, TEMP0);\ - a1 = _mm256_xor_pd(a1, TEMP1);\ - a2 = _mm256_xor_pd(a2, TEMP2);\ - a3 = _mm256_xor_pd(a3, b3);\ - a4 = _mm256_xor_pd(a4, b4);\ - a5 = _mm256_xor_pd(a5, b5);\ - a6 = _mm256_xor_pd(a6, b6);\ - a7 = _mm256_xor_pd(a7, b7);\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0);\ - VMUL2(a1);\ - VMUL2(a2);\ - VMUL2(a3);\ - VMUL2(a4);\ - VMUL2(a5);\ - VMUL2(a6);\ - VMUL2(a7);\ - \ - /* add to y_4 y_5 .. v3, v4, ... */\ - b0 = _mm256_xor_pd(a3, TEMP0);\ - b1 = _mm256_xor_pd(a4, TEMP1);\ - b2 = _mm256_xor_pd(a5, TEMP2);\ - b3 = _mm256_xor_pd(b3, a6);\ - b4 = _mm256_xor_pd(b4, a7);\ - b5 = _mm256_xor_pd(b5, a0);\ - b6 = _mm256_xor_pd(b6, a1);\ - b7 = _mm256_xor_pd(b7, a2);\ -}/*MixBytes*/
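The relations in the comment above are easy to sanity-check in scalar form. Below is a standalone scalar sketch, not part of the patch (mul2, gmul, and mixbytes_column are illustrative names): mul2 is the GF(2^8) doubling that VMUL2 performs bytewise with pcmpgtb/paddb/pand/pxor, and mixbytes_column rebuilds one 8-byte MixBytes column from the t/x/y/w/v chain, checking it against the direct circulant circ(02,02,03,04,05,03,05,07).

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch, not from the patch. GF(2^8) doubling modulo the
   Groestl/AES polynomial x^8+x^4+x^3+x+1 (0x11b) -- the scalar equivalent
   of the VMUL2/MUL2 macros above. */
static uint8_t mul2(uint8_t a)
{
    return (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0x00));
}

/* One MixBytes column via the relations in the comment above; a[] and b[]
   hold the 8 row bytes of one state column, indices are mod 8. */
static void mixbytes_column(const uint8_t a[8], uint8_t b[8])
{
    uint8_t t[8], x[8], y[8], w[8], v[8];
    for (int i = 0; i < 8; i++) t[i] = a[i] ^ a[(i + 1) & 7];
    for (int i = 0; i < 8; i++) x[i] = t[i] ^ t[(i + 3) & 7];
    for (int i = 0; i < 8; i++) y[i] = t[i] ^ t[(i + 2) & 7] ^ a[(i + 6) & 7];
    for (int i = 0; i < 8; i++) w[i] = mul2(x[i]) ^ y[(i + 4) & 7]; /* z_i folded in */
    for (int i = 0; i < 8; i++) v[i] = mul2(w[i]);
    for (int i = 0; i < 8; i++) b[i] = v[(i + 3) & 7] ^ y[(i + 4) & 7];
}

/* Multiply by a small constant using doublings only. */
static uint8_t gmul(uint8_t a, int k)
{
    uint8_t a2 = mul2(a), a4 = mul2(a2);
    switch (k) {
    case 2: return a2;
    case 3: return (uint8_t)(a2 ^ a);
    case 4: return a4;
    case 5: return (uint8_t)(a4 ^ a);
    case 7: return (uint8_t)(a4 ^ a2 ^ a);
    }
    return a;
}

int main(void)
{
    static const int C[8] = { 2, 2, 3, 4, 5, 3, 5, 7 }; /* circulant row */
    uint8_t a[8] = { 0x00, 0x01, 0x35, 0x80, 0xfe, 0x13, 0xc7, 0x5a };
    uint8_t b[8], r[8];
    mixbytes_column(a, b);
    for (int i = 0; i < 8; i++) {
        r[i] = 0;
        for (int j = 0; j < 8; j++) r[i] ^= gmul(a[(i + j) & 7], C[j]);
    }
    for (int i = 0; i < 8; i++)
        printf("row %d: %02x %02x %s\n", i, b[i], r[i],
               b[i] == r[i] ? "ok" : "MISMATCH");
    return 0;
}

Compiled as plain C99 (e.g. cc -std=c99), every row should print ok; the same chain is what the register scheduling above implements with a handful of spills (TEMP0..TEMP3).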
- -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBSHIFTMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* ShiftBytes + SubBytes */\ - SubShift(a0, 0, 1);\ - SubShift(a1, 1, 3);\ - SubShift(a2, 2, 5);\ - SubShift(a3, 3, 7);\ - SubShift(a4, 4, 0);\ - SubShift(a5, 5, 2);\ - SubShift(a6, 6, 4);\ - SubShift(a7, 7, 6);\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -#define ROUNDS_P_Q(){\ - u8 round_counter = 0;\ - for(round_counter = 0; round_counter < 14; round_counter++) {\ - /* AddRoundConstant */\ - ymm6 = _mm256_xor_pd(ymm6, ymm6);\ - ymm7 = insert_m128i_in_m256d(ymm6, ROUND_CONST_Q[round_counter], 1);\ - ymm6 = insert_m128i_in_m256d(ymm6, ALL_FF, 1);\ - ymm0 = insert_m128i_in_m256d(ymm6, ROUND_CONST_P[round_counter], 0);\ - ymm0 = _mm256_xor_pd(ymm8, ymm0);\ - ymm1 = _mm256_xor_pd(ymm9, ymm6);\ - ymm2 = _mm256_xor_pd(ymm10, ymm6);\ - ymm3 = _mm256_xor_pd(ymm11, ymm6);\ - ymm4 = _mm256_xor_pd(ymm12, ymm6);\ - ymm5 = _mm256_xor_pd(ymm13, ymm6);\ - ymm6 = _mm256_xor_pd(ymm14, ymm6);\ - ymm7 = _mm256_xor_pd(ymm15, ymm7);\ - /* SubBytes + ShiftBytes + MixBytes */\ - SUBSHIFTMIX(ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15);\ - }\ -} - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - t0 = TRANSP_MASK;\ - \ - i6 = _mm_shuffle_epi8(i6, t0);\ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ - i4 = _mm_shuffle_epi8(i4, t0);\ - i5 = _mm_shuffle_epi8(i5, t0);\ - i7 = _mm_shuffle_epi8(i7, t0);\ - \ - /* continue with unpack */\ - t0 = _mm_unpackhi_epi16(i0, i1);\ - t1 = _mm_unpackhi_epi16(i2, i3);\ - t2 = _mm_unpackhi_epi16(i4, i5);\ - t3 = _mm_unpackhi_epi16(i6, i7);\ - i0 = _mm_unpacklo_epi16(i0, i1);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - i4 = _mm_unpacklo_epi16(i4, i5);\ - i6 = _mm_unpacklo_epi16(i6, i7);\ - \ - /* shuffle with immediate */\ - t0 = _mm_shuffle_epi32(t0, 216);\ - t1 = _mm_shuffle_epi32(t1, 216);\ - t2 = _mm_shuffle_epi32(t2, 216);\ - t3 = _mm_shuffle_epi32(t3, 216);\ - i0 = _mm_shuffle_epi32(i0, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - i4 = _mm_shuffle_epi32(i4, 216);\ - i6 = _mm_shuffle_epi32(i6, 216);\ - \ - /* continue with unpack */\ - t4 = _mm_unpackhi_epi32(i0, i2);\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - t5 = _mm_unpackhi_epi32(t0, t1);\ - t0 = _mm_unpacklo_epi32(t0, t1);\ - t6 = _mm_unpackhi_epi32(i4, i6);\ - i4 = _mm_unpacklo_epi32(i4, i6);\ - t7 = _mm_unpackhi_epi32(t2, t3);\ - t2 = _mm_unpacklo_epi32(t2, t3);\ - \ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - i1 = _mm_unpackhi_epi64(i0, i4);\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - i2 = _mm_unpacklo_epi64(t0, t2);\ - i3 = _mm_unpackhi_epi64(t0, t2);\ - i4 = _mm_unpacklo_epi64(t4, t6);\ - i5 = _mm_unpackhi_epi64(t4, t6);\ - i6 = _mm_unpacklo_epi64(t5, t7);\ - i7 = _mm_unpackhi_epi64(t5, t7);\ - /* transpose done */\ -}/**/ - -/* Matrix
Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - o0 = TRANSP_MASK;\ - /* transpose matrix to get output format */\ - o1 = _mm_unpackhi_epi64(i0, i1);\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - t0 = _mm_unpackhi_epi64(i2, i3);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - t1 = _mm_unpackhi_epi64(i4, i5);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - t2 = _mm_unpackhi_epi64(i6, i7);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ - /* load transpose mask into a register, because it will be used 8 times */\ - i0 = _mm_shuffle_epi8(i0, o0);\ - i2 = _mm_shuffle_epi8(i2, o0);\ - i4 = _mm_shuffle_epi8(i4, o0);\ - i6 = _mm_shuffle_epi8(i6, o0);\ - o1 = _mm_shuffle_epi8(o1, o0);\ - t0 = _mm_shuffle_epi8(t0, o0);\ - t1 = _mm_shuffle_epi8(t1, o0);\ - t2 = _mm_shuffle_epi8(t2, o0);\ - /* continue with unpack */\ - t3 = _mm_unpackhi_epi16(i4, i6);\ - i4 = _mm_unpacklo_epi16(i4, i6);\ - o0 = _mm_unpackhi_epi16(i0, i2);\ - i0 = _mm_unpacklo_epi16(i0, i2);\ - o2 = _mm_unpackhi_epi16(o1, t0);\ - o1 = _mm_unpacklo_epi16(o1, t0);\ - t4 = _mm_unpackhi_epi16(t1, t2);\ - t1 = _mm_unpacklo_epi16(t1, t2);\ - /* shuffle with immediate */\ - i4 = _mm_shuffle_epi32(i4, 216);\ - t3 = _mm_shuffle_epi32(t3, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - o2 = _mm_shuffle_epi32(o2, 216);\ - i0 = _mm_shuffle_epi32(i0, 216);\ - o0 = _mm_shuffle_epi32(o0, 216);\ - t1 = _mm_shuffle_epi32(t1, 216);\ - t4 = _mm_shuffle_epi32(t4, 216);\ - /* continue with unpack */\ - i1 = _mm_unpackhi_epi32(i0, i4);\ - i0 = _mm_unpacklo_epi32(i0, i4);\ - i3 = _mm_unpackhi_epi32(o0, t3);\ - o0 = _mm_unpacklo_epi32(o0, t3);\ - i5 = _mm_unpackhi_epi32(o1, t1);\ - o1 = _mm_unpacklo_epi32(o1, t1);\ - i7 = _mm_unpackhi_epi32(o2, t4);\ - o2 = _mm_unpacklo_epi32(o2, t4);\ - /* transpose done */\ -}/**/ - -void INIT(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - - /* load IV into registers xmm8 - xmm15 */ - xmm8 = chaining[0]; - xmm9 = chaining[1]; - xmm10 = chaining[2]; - xmm11 = chaining[3]; - xmm12 = chaining[4]; - xmm13 = chaining[5]; - xmm14 = chaining[6]; - xmm15 = chaining[7]; - - /* transform chaining value from column ordering into row ordering */ - Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - - /* store transposed IV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; -} - -void TF1024(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i xmmIL, xmmIH, xmmJL, xmmJH, xmmZERO; - static __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; - static __m256d ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15; - static __m256d ymmJ; - static __m256d TEMP0; - static __m256d TEMP1; - static __m256d TEMP2; - static __m256d TEMP3; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm8 - xmm15 (Q = message) */ - xmm0 = message[0]; - xmm1 = 
message[1]; - xmm2 = message[2]; - xmm3 = message[3]; - xmm4 = message[4]; - xmm5 = message[5]; - xmm6 = message[6]; - xmm7 = message[7]; - - /* transform message M from column ordering into row ordering */ - Matrix_Transpose(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm0, chaining[0]); - xmm9 = _mm_xor_si128(xmm1, chaining[1]); - xmm10 = _mm_xor_si128(xmm2, chaining[2]); - xmm11 = _mm_xor_si128(xmm3, chaining[3]); - xmm12 = _mm_xor_si128(xmm4, chaining[4]); - xmm13 = _mm_xor_si128(xmm5, chaining[5]); - xmm14 = _mm_xor_si128(xmm6, chaining[6]); - xmm15 = _mm_xor_si128(xmm7, chaining[7]); - - /* generate AVX registers with Q in high and P in low 128 bits */ - ymm8 = insert_m128i_in_m256d(ymm8, xmm8, 0); - ymm9 = insert_m128i_in_m256d(ymm9, xmm9, 0); - ymm10 = insert_m128i_in_m256d(ymm10, xmm10, 0); - ymm11 = insert_m128i_in_m256d(ymm11, xmm11, 0); - ymm12 = insert_m128i_in_m256d(ymm12, xmm12, 0); - ymm13 = insert_m128i_in_m256d(ymm13, xmm13, 0); - ymm14 = insert_m128i_in_m256d(ymm14, xmm14, 0); - ymm15 = insert_m128i_in_m256d(ymm15, xmm15, 0); - - ymm8 = insert_m128i_in_m256d(ymm8, xmm0, 1); - ymm9 = insert_m128i_in_m256d(ymm9, xmm1, 1); - ymm10 = insert_m128i_in_m256d(ymm10, xmm2, 1); - ymm11 = insert_m128i_in_m256d(ymm11, xmm3, 1); - ymm12 = insert_m128i_in_m256d(ymm12, xmm4, 1); - ymm13 = insert_m128i_in_m256d(ymm13, xmm5, 1); - ymm14 = insert_m128i_in_m256d(ymm14, xmm6, 1); - ymm15 = insert_m128i_in_m256d(ymm15, xmm7, 1); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* extract Q to xmm */ - xmm0 = extract_m128i_from_m256d(ymm8, 1); - xmm1 = extract_m128i_from_m256d(ymm9, 1); - xmm2 = extract_m128i_from_m256d(ymm10, 1); - xmm3 = extract_m128i_from_m256d(ymm11, 1); - xmm4 = extract_m128i_from_m256d(ymm12, 1); - xmm5 = extract_m128i_from_m256d(ymm13, 1); - xmm6 = extract_m128i_from_m256d(ymm14, 1); - xmm7 = extract_m128i_from_m256d(ymm15, 1); - - /* extract P to xmm */ - xmm8 = extract_m128i_from_m256d(ymm8, 0); - xmm9 = extract_m128i_from_m256d(ymm9, 0); - xmm10 = extract_m128i_from_m256d(ymm10, 0); - xmm11 = extract_m128i_from_m256d(ymm11, 0); - xmm12 = extract_m128i_from_m256d(ymm12, 0); - xmm13 = extract_m128i_from_m256d(ymm13, 0); - xmm14 = extract_m128i_from_m256d(ymm14, 0); - xmm15 = extract_m128i_from_m256d(ymm15, 0); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, xmm0); - xmm9 = _mm_xor_si128(xmm9, xmm1); - xmm10 = _mm_xor_si128(xmm10, xmm2); - xmm11 = _mm_xor_si128(xmm11, xmm3); - xmm12 = _mm_xor_si128(xmm12, xmm4); - xmm13 = _mm_xor_si128(xmm13, xmm5); - xmm14 = _mm_xor_si128(xmm14, xmm6); - xmm15 = _mm_xor_si128(xmm15, xmm7); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, chaining[0]); - xmm9 = _mm_xor_si128(xmm9, chaining[1]); - xmm10 = _mm_xor_si128(xmm10, chaining[2]); - xmm11 = _mm_xor_si128(xmm11, chaining[3]); - xmm12 = _mm_xor_si128(xmm12, chaining[4]); - xmm13 = _mm_xor_si128(xmm13, chaining[5]); - xmm14 = _mm_xor_si128(xmm14, chaining[6]); - xmm15 = _mm_xor_si128(xmm15, chaining[7]); - - /* store CV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - 
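An editorial aside on the XOR sequences and stores in this function: groestl-intr-avx.h targets first-generation AVX, which has no 256-bit integer XOR (_mm256_xor_si256 arrived with AVX2), so every 256-bit XOR of integer state here is issued as the bitwise floating-point op _mm256_xor_pd. A minimal standalone sketch of the idiom, with illustrative array and variable names:

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>   /* AVX: __m256d, _mm256_xor_pd */

int main(void)
{
    /* Illustrative sketch, not part of the patch: XOR two 256-bit blocks
       of integer data through the float domain, as TF1024 does above. */
    uint64_t a[4] = { 1, 2, 3, 4 };
    uint64_t b[4] = { 0xff, 0xff00, 0xff0000, 0xff000000 };
    uint64_t r[4];

    __m256d va = _mm256_loadu_pd((const double *)a);
    __m256d vb = _mm256_loadu_pd((const double *)b);
    /* _mm256_xor_pd is purely bitwise: no FP arithmetic and no exceptions,
       so arbitrary integer bit patterns pass through unchanged. */
    _mm256_storeu_pd((double *)r, _mm256_xor_pd(va, vb));

    for (int i = 0; i < 4; i++)
        printf("%016llx\n", (unsigned long long)r[i]);
    return 0;
}

Build with an AVX-enabled compiler (e.g. cc -mavx); the loads and stores are raw 256-bit moves, so no value is ever interpreted as a double.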
chaining[6] = xmm14; - chaining[7] = xmm15; - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF1024(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i xmmIL, xmmIH, xmmJL, xmmJH, xmmZERO; - static __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7; - static __m256d ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15; - static __m256d ymmJ; - static __m256d TEMP0; - static __m256d TEMP1; - static __m256d TEMP2; - static __m256d TEMP3; - - /* load CV into registers xmm8...xmm15 */ - xmm8 = chaining[0]; - xmm9 = chaining[1]; - xmm10 = chaining[2]; - xmm11 = chaining[3]; - xmm12 = chaining[4]; - xmm13 = chaining[5]; - xmm14 = chaining[6]; - xmm15 = chaining[7]; - - xmm0 = _mm_xor_si128(xmm0, xmm0); - - /* generate AVX registers with Q in high and P in low 128 bits */ - ymm8 = insert_m128i_in_m256d(ymm8, xmm8, 0); - ymm9 = insert_m128i_in_m256d(ymm9, xmm9, 0); - ymm10 = insert_m128i_in_m256d(ymm10, xmm10, 0); - ymm11 = insert_m128i_in_m256d(ymm11, xmm11, 0); - ymm12 = insert_m128i_in_m256d(ymm12, xmm12, 0); - ymm13 = insert_m128i_in_m256d(ymm13, xmm13, 0); - ymm14 = insert_m128i_in_m256d(ymm14, xmm14, 0); - ymm15 = insert_m128i_in_m256d(ymm15, xmm15, 0); - - ymm8 = insert_m128i_in_m256d(ymm8, xmm0, 1); - ymm9 = insert_m128i_in_m256d(ymm9, xmm0, 1); - ymm10 = insert_m128i_in_m256d(ymm10, xmm0, 1); - ymm11 = insert_m128i_in_m256d(ymm11, xmm0, 1); - ymm12 = insert_m128i_in_m256d(ymm12, xmm0, 1); - ymm13 = insert_m128i_in_m256d(ymm13, xmm0, 1); - ymm14 = insert_m128i_in_m256d(ymm14, xmm0, 1); - ymm15 = insert_m128i_in_m256d(ymm15, xmm0, 1); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8...xmm15 */ - ROUNDS_P_Q(); - - xmm8 = extract_m128i_from_m256d(ymm8, 0); - xmm9 = extract_m128i_from_m256d(ymm9, 0); - xmm10 = extract_m128i_from_m256d(ymm10, 0); - xmm11 = extract_m128i_from_m256d(ymm11, 0); - xmm12 = extract_m128i_from_m256d(ymm12, 0); - xmm13 = extract_m128i_from_m256d(ymm13, 0); - xmm14 = extract_m128i_from_m256d(ymm14, 0); - xmm15 = extract_m128i_from_m256d(ymm15, 0); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, chaining[0]); - xmm9 = _mm_xor_si128(xmm9, chaining[1]); - xmm10 = _mm_xor_si128(xmm10, chaining[2]); - xmm11 = _mm_xor_si128(xmm11, chaining[3]); - xmm12 = _mm_xor_si128(xmm12, chaining[4]); - xmm13 = _mm_xor_si128(xmm13, chaining[5]); - xmm14 = _mm_xor_si128(xmm14, chaining[6]); - xmm15 = _mm_xor_si128(xmm15, chaining[7]); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); - - /* we only need to return the truncated half of the state */ - chaining[0] = xmm8; - chaining[1] = xmm4; - chaining[2] = xmm9; - chaining[3] = xmm11; - chaining[4] = xmm0; - chaining[5] = xmm6; - chaining[6] = xmm13; - chaining[7] = xmm15; - - return; -}//OF1024() - -#endif - diff --git a/algo/groestl/aes_ni/groestl-intr-vperm.h b/algo/groestl/aes_ni/groestl-intr-vperm.h deleted file mode 100644 index c755229..0000000 --- a/algo/groestl/aes_ni/groestl-intr-vperm.h +++ /dev/null @@ -1,1294 +0,0 @@ -/* groestl-intr-vperm.h Aug 2011 - * - * Groestl implementation with intrinsics using ssse3 instructions. - * Author: Günther A. 
Roland, Martin Schläffer - * - * Based on the vperm and aes_ni implementations of the hash function Groestl - * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ - * Institute of Applied Mathematics, Middle East Technical University, Turkey - * - * This code is placed in the public domain - */ - -#include <tmmintrin.h> -#include "hash-groestl.h" - -/* global constants */ -__m128i ROUND_CONST_Lx; -__m128i ROUND_CONST_L0[ROUNDS512]; -__m128i ROUND_CONST_L7[ROUNDS512]; -__m128i ROUND_CONST_P[ROUNDS1024]; -__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_0F; -__m128i ALL_15; -__m128i ALL_1B; -__m128i ALL_63; -__m128i ALL_FF; -__m128i VPERM_IPT[2]; -__m128i VPERM_OPT[2]; -__m128i VPERM_INV[2]; -__m128i VPERM_SB1[2]; -__m128i VPERM_SB2[2]; -__m128i VPERM_SB4[2]; -__m128i VPERM_SBO[2]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_SHARED_CONSTANTS(){\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\ - ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\ - ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\ - VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\ - VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\ - VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\ - VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\ - VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\ - VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\ - VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\ - VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\ - VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\ - VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\ - VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\ - VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\ -}/**/ - -/* VPERM - * Transform w/o settings c* - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Groestl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ - t0 = c0;\ - t1 = c0;\ - t0 = _mm_andnot_si128(t0, a0);\ - t1 = _mm_andnot_si128(t1, a1);\ - t0 = _mm_srli_epi32(t0, 4);\ - t1 = _mm_srli_epi32(t1, 4);\ - a0 = _mm_and_si128(a0, c0);\ - a1 = _mm_and_si128(a1, c0);\ - t2 = c2;\ - t3 = c2;\ - t2 = _mm_shuffle_epi8(t2, a0);\ - t3 = _mm_shuffle_epi8(t3, a1);\ - a0 = c1;\ - a1 = c1;\ - a0 = _mm_shuffle_epi8(a0, t0);\ - a1 = _mm_shuffle_epi8(a1, t1);\ - a0 = _mm_xor_si128(a0, t2);\ - a1 = _mm_xor_si128(a1, t3);\ -}/**/ - -#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ - c0 = ALL_0F;\ - c1 = ((__m128i*) table )[0];\ - c2 = ((__m128i*) table )[1];\ -}/**/
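All of the VPERM_* building blocks that follow lean on one primitive: pshufb (_mm_shuffle_epi8) used as sixteen parallel lookups into a 16-entry byte table, applied separately to the low and high nibbles exactly as VPERM_Transform_No_Const does above with the ALL_0F mask. A standalone sketch of the bare trick, not part of the patch (per-byte popcount instead of Groestl's S-box; tbl and m0f are illustrative names):

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

int main(void)
{
    /* 16-entry table indexed by a 4-bit value: here, popcount(0..15). */
    const __m128i tbl = _mm_setr_epi8(0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4);
    const __m128i m0f = _mm_set1_epi8(0x0f);

    uint8_t in[16], out[16];
    for (int i = 0; i < 16; i++) in[i] = (uint8_t)(i * 37 + 11);

    __m128i x  = _mm_loadu_si128((const __m128i *)in);
    __m128i lo = _mm_and_si128(x, m0f);                     /* low nibbles  */
    __m128i hi = _mm_and_si128(_mm_srli_epi16(x, 4), m0f);  /* high nibbles */
    /* Each byte of the index operand selects a byte of the table operand;
       indices stay in 0..15, so pshufb's sign-bit zeroing never triggers.
       Two lookups plus an add give the per-byte popcount. */
    __m128i cnt = _mm_add_epi8(_mm_shuffle_epi8(tbl, lo),
                               _mm_shuffle_epi8(tbl, hi));
    _mm_storeu_si128((__m128i *)out, cnt);

    for (int i = 0; i < 16; i++)
        printf("%02x -> %d\n", in[i], out[i]);
    return 0;
}

Build with an SSSE3-enabled compiler (e.g. cc -mssse3). Hamburg's constant-time AES construction layers several such table pairs (the VPERM_SB1/SB2/SB4 constants above) to evaluate the S-box and its *2/*4 multiples without data-dependent memory accesses.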
/* VPERM - * Transform - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Groestl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Transform State - * inputs: - * a0-a3 = state - * table = transformation table to use - * t* = clobbers - * outputs: - * a0-a3 = transformed state - * */ -#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ - VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Add Constant to State - * inputs: - * a0-a7 = state - * constant = constant to add - * t0 = clobber - * outputs: - * a0-a7 = state + constant - * */ -#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ - t0 = constant;\ - a0 = _mm_xor_si128(a0, t0);\ - a1 = _mm_xor_si128(a1, t0);\ - a2 = _mm_xor_si128(a2, t0);\ - a3 = _mm_xor_si128(a3, t0);\ - a4 = _mm_xor_si128(a4, t0);\ - a5 = _mm_xor_si128(a5, t0);\ - a6 = _mm_xor_si128(a6, t0);\ - a7 = _mm_xor_si128(a7, t0);\ -}/**/ - -/* VPERM - * Set Substitute Core Constants - * */ -#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ - VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ -}/**/ - -/* VPERM - * Substitute Core - * first part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Groestl - * by Cagdas CALIK - * inputs: - * a0 = 1 row - * t*, c* = clobbers - * outputs: - * b0a, b0b = inputs for lookup step - * */ -#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ - t0 = c0;\ - t0 = _mm_andnot_si128(t0, a0);\ - t0 = _mm_srli_epi32(t0, 4);\ - a0 = _mm_and_si128(a0, c0);\ - b0a = c1;\ - b0a = _mm_shuffle_epi8(b0a, a0);\ - a0 = _mm_xor_si128(a0, t0);\ - b0b = c2;\ - b0b = _mm_shuffle_epi8(b0b, t0);\ - b0b = _mm_xor_si128(b0b, b0a);\ - t1 = c2;\ - t1 = _mm_shuffle_epi8(t1, a0);\ - t1 = _mm_xor_si128(t1, b0a);\ - b0a = c2;\ - b0a = _mm_shuffle_epi8(b0a, b0b);\ - b0a = _mm_xor_si128(b0a, a0);\ - b0b = c2;\ - b0b = _mm_shuffle_epi8(b0b, t1);\ - b0b = _mm_xor_si128(b0b, t0);\ -}/**/ - -/* VPERM - * Lookup - * second part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Groestl - * by Cagdas CALIK - * inputs: - * a0a, a0b = output of Substitution Core - * table = lookup table to use (*1 / *2 / *4) - * t0 = clobber - * outputs: - * b0 = output of sbox + multiplication - * */ -#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ - b0 = ((__m128i*) table )[0];\ - t0 = ((__m128i*) table )[1];\ - b0 = _mm_shuffle_epi8(b0, a0b);\ - t0 = _mm_shuffle_epi8(t0, a0a);\ - b0 = _mm_xor_si128(b0, t0);\ -}/**/ - -/* VPERM - * SubBytes and *2 / *4 - * this function is derived from: - * Constant-time SSSE3 AES core implementation - * by Mike Hamburg - * and - * vperm and aes_ni implementations of hash function Groestl - * by Cagdas CALIK - * inputs: - * a0-a7 = state - * t*, c* = clobbers - * outputs: - * a0-a7 = state * 4 - * c2 = row0 * 2 -> b0 - * c1 = row7 * 2 -> b3 - * c0 = row7 * 1 -> b4 - * t2 = row4 * 1 -> b7 - * TEMP_MUL1 = row(i) * 1 - * TEMP_MUL2 = row(i) * 2 - * - * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ -#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ - /* set
Constants */\ - VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ - /* row 1 */\ - VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[1] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[1] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ - /* --- */\ - /* row 2 */\ - VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[2] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[2] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ - /* --- */\ - /* row 3 */\ - VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[3] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[3] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ - /* --- */\ - /* row 5 */\ - VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[5] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[5] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ - /* --- */\ - /* row 6 */\ - VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[6] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[6] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ - /* --- */\ - /* row 7 */\ - VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[7] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ - VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ - /* --- */\ - /* row 4 */\ - VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[4] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ - /* --- */\ - /* row 0 */\ - VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ - TEMP_MUL2[0] = c2;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ - /* --- */\ -}/**/ - - -/* Optimized MixBytes - * inputs: - * a0-a7 = (row0-row7) * 4 - * b0 = row0 * 2 - * b3 = row7 * 2 - * b4 = row7 * 1 - * b7 = row4 * 1 - * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 - * output: b0-b7 - * */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* save one value */\ - TEMP_MUL4 = a3;\ - /* 1 */\ - b1 = a0;\ - b1 = _mm_xor_si128(b1, a5);\ - b1 = _mm_xor_si128(b1, b4); /* -> helper! */\ - b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\ - b2 = b1;\ - \ - /* 2 */\ - b5 = a1;\ - b5 = _mm_xor_si128(b5, a4);\ - b5 = _mm_xor_si128(b5, b7); /* -> helper! */\ - b5 = _mm_xor_si128(b5, b3); /* -> helper! */\ - b6 = b5;\ - \ - /* 4 */\ - b7 = _mm_xor_si128(b7, a6);\ - /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\ - b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\ - b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\ - b7 = _mm_xor_si128(b7, b3); /* -> helper! */\ - b2 = _mm_xor_si128(b2, b7);\ - \ - /* 3 */\ - b0 = _mm_xor_si128(b0, a7);\ - b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\ - b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\ - /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\ - b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\ - b3 = b0;\ - b1 = _mm_xor_si128(b1, b0);\ - b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\ - \ - /* 5 */\ - b4 = _mm_xor_si128(b4, a2);\ - /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! 
*/\ - b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\ - b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\ - b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\ - b3 = _mm_xor_si128(b3, b4);\ - b6 = _mm_xor_si128(b6, b4);\ - \ - /* 6 */\ - a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\ - b4 = _mm_xor_si128(b4, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - b7 = _mm_xor_si128(b7, a3);\ - \ - /* 7 */\ - a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\ - a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\ - b2 = _mm_xor_si128(b2, a1);\ - b3 = _mm_xor_si128(b3, a1);\ - \ - /* 8 */\ - a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\ - a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\ - b6 = _mm_xor_si128(b6, a5);\ - b7 = _mm_xor_si128(b7, a5);\ - \ - /* 9 */\ - a3 = TEMP_MUL1[2];\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\ - b0 = _mm_xor_si128(b0, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - \ - /* 10 */\ - a1 = TEMP_MUL1[6];\ - a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\ - b1 = _mm_xor_si128(b1, a1);\ - b4 = _mm_xor_si128(b4, a1);\ - \ - /* 11 */\ - a5 = TEMP_MUL1[3];\ - a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\ - b1 = _mm_xor_si128(b1, a5);\ - b6 = _mm_xor_si128(b6, a5);\ - \ - /* 12 */\ - a3 = TEMP_MUL1[7];\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\ - b2 = _mm_xor_si128(b2, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - \ - /* 13 */\ - b0 = _mm_xor_si128(b0, (TEMP_MUL4));\ - b0 = _mm_xor_si128(b0, a4);\ - b1 = _mm_xor_si128(b1, a4);\ - b3 = _mm_xor_si128(b3, a6);\ - b4 = _mm_xor_si128(b4, a0);\ - b4 = _mm_xor_si128(b4, a7);\ - b5 = _mm_xor_si128(b5, a0);\ - b7 = _mm_xor_si128(b7, a2);\ -}/**/ - -#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\ - SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\ - SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\ - SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\ - SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\ - SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ - }\ - ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ -}/**/ - -/* vperm: - * transformation before rounds with ipt - * first round add transformed constant - * middle rounds: add constant XOR 0x15...15 - * last round: additionally add 0x15...15 after MB - * transformation after rounds with opt - */ -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant + ShiftBytes (interleaved) */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a4 = _mm_xor_si128(a4, b1);\ - a2 = _mm_shuffle_epi8(a2, 
(SUBSH_MASK[2]));\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - /* SubBytes + Multiplication by 2 and 4 */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ - ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ -} - - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ -\ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ -\ - o1 = i0;\ - t0 = i2;\ -\ - i0 = _mm_unpacklo_epi16(i0, i1);\ - o1 = _mm_unpackhi_epi16(o1, i1);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - t0 = _mm_unpackhi_epi16(t0, i3);\ -\ - i0 = _mm_shuffle_epi32(i0, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - t0 = _mm_shuffle_epi32(t0, 216);\ -\ - o2 = i0;\ - o3 = o1;\ -\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - o1 = _mm_unpacklo_epi32(o1, t0);\ - o2 = _mm_unpackhi_epi32(o2, i2);\ - o3 = _mm_unpackhi_epi32(o3, t0);\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - o1 = i0;\ - o2 = i1;\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - o1 = _mm_unpackhi_epi64(o1, i4);\ - o3 = i1;\ - o4 = i2;\ - o2 = _mm_unpacklo_epi64(o2, i5);\ - o3 = 
_mm_unpackhi_epi64(o3, i5);\ - o5 = i2;\ - o6 = i3;\ - o4 = _mm_unpacklo_epi64(o4, i6);\ - o5 = _mm_unpackhi_epi64(o5, i6);\ - o7 = i3;\ - o6 = _mm_unpacklo_epi64(o6, i7);\ - o7 = _mm_unpackhi_epi64(o7, i7);\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - o0 = i0;\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o0 = _mm_unpackhi_epi64(o0, i1);\ - o1 = i2;\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - o1 = _mm_unpackhi_epi64(o1, i3);\ - o2 = i4;\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - o2 = _mm_unpackhi_epi64(o2, i5);\ - o3 = i6;\ - i6 = _mm_unpacklo_epi64(i6, i7);\ - o3 = _mm_unpackhi_epi64(o3, i7);\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - t0 = _mm_xor_si128(t0, t0);\ - i1 = i0;\ - i3 = i2;\ - i5 = i4;\ - i7 = i6;\ - i0 = _mm_unpacklo_epi64(i0, t0);\ - i1 = _mm_unpackhi_epi64(i1, t0);\ - i2 = _mm_unpacklo_epi64(i2, t0);\ - i3 = _mm_unpackhi_epi64(i3, t0);\ - i4 = _mm_unpacklo_epi64(i4, t0);\ - i5 = _mm_unpackhi_epi64(i5, t0);\ - i6 = _mm_unpacklo_epi64(i6, t0);\ - i7 = _mm_unpackhi_epi64(i7, t0);\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - xmm0 = ROUND_CONST_L0[i];\ - xmm1 = ROUND_CONST_L7[i];\ - xmm2 = ROUND_CONST_L0[j];\ - xmm3 = ROUND_CONST_L7[j];\ - VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ - xmm1 = _mm_xor_si128(xmm1, (ALL_15));\ - xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ - xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ - ROUND_CONST_L0[i] = xmm0;\ - ROUND_CONST_L7[i] = xmm1;\ - ROUND_CONST_L0[j] = xmm2;\ - ROUND_CONST_L7[j] = xmm3;\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - xmm0 = ROUND_CONST_Lx;\ - VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ - ROUND_CONST_Lx = xmm0;\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ -}/**/ - -void INIT(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15; - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm12 - xmm15 */ - xmm12 = chaining[0]; - xmm13 = chaining[1]; - xmm14 = chaining[2]; - xmm15 = chaining[3]; - - /* 
transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* store transposed IV */ - chaining[0] = xmm12; - chaining[1] = xmm2; - chaining[2] = xmm6; - chaining[3] = xmm7; -} - -void TF512(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm12 - xmm15 */ - xmm12 = message[0]; - xmm13 = message[1]; - xmm14 = message[2]; - xmm15 = message[3]; - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - xmm8 = chaining[0]; - xmm0 = chaining[1]; - xmm4 = chaining[2]; - xmm5 = chaining[3]; - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - xmm8 = _mm_xor_si128(xmm8, xmm12); - xmm0 = _mm_xor_si128(xmm0, xmm2); - xmm4 = _mm_xor_si128(xmm4, xmm6); - xmm5 = _mm_xor_si128(xmm5, xmm7); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm10); - xmm2 = _mm_xor_si128(xmm2, xmm12); - xmm3 = _mm_xor_si128(xmm3, xmm14); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, (chaining[0])); - xmm1 = _mm_xor_si128(xmm1, (chaining[1])); - xmm2 = _mm_xor_si128(xmm2, (chaining[2])); - xmm3 = _mm_xor_si128(xmm3, (chaining[3])); - - /* store CV */ - chaining[0] = xmm0; - chaining[1] = xmm1; - chaining[2] = xmm2; - chaining[3] = xmm3; - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF512(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - xmm8 = chaining[0]; - xmm10 = chaining[1]; - xmm12 = chaining[2]; - xmm14 = chaining[3]; - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm 
register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm10 = _mm_xor_si128(xmm10, (chaining[1])); - xmm12 = _mm_xor_si128(xmm12, (chaining[2])); - xmm14 = _mm_xor_si128(xmm14, (chaining[3])); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); - VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); - - /* we only need to return the truncated half of the state */ - chaining[2] = xmm9; - chaining[3] = xmm11; - - return; -}//OF512() - -#endif - -#if (LENGTH > 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ - SUBSH_MASK[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);\ - SUBSH_MASK[1] = _mm_set_epi32(0x000f0e0d, 0x0c0b0a09, 0x08070605, 0x04030201);\ - SUBSH_MASK[2] = _mm_set_epi32(0x01000f0e, 0x0d0c0b0a, 0x09080706, 0x05040302);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0201000f, 0x0e0d0c0b, 0x0a090807, 0x06050403);\ - SUBSH_MASK[4] = _mm_set_epi32(0x03020100, 0x0f0e0d0c, 0x0b0a0908, 0x07060504);\ - SUBSH_MASK[5] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605);\ - SUBSH_MASK[6] = _mm_set_epi32(0x05040302, 0x01000f0e, 0x0d0c0b0a, 0x09080706);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b);\ - for(i = 0; i < ROUNDS1024; i++)\ - {\ - ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ - }\ -}/**/ - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* SubBytes + Multiplication */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -#define ROUNDS_P(){\ - u8 round_counter = 0;\ - for(round_counter = 0; round_counter < 14; round_counter+=2) {\ - /* AddRoundConstant P1024 */\ - xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\ - xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\ - /* SubBytes + MixBytes */\ - SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - VPERM_Add_Constant(xmm0, 
xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, ALL_15, xmm8);\ - \ - /* AddRoundConstant P1024 */\ - xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ - xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ - /* SubBytes + MixBytes */\ - SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ - }\ -}/**/ - -#define ROUNDS_Q(){\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm1);\ - u8 round_counter = 0;\ - for(round_counter = 0; round_counter < 14; round_counter+=2) {\ - /* AddRoundConstant Q1024 */\ - xmm1 = ALL_FF;\ - xmm8 = _mm_xor_si128(xmm8, xmm1);\ - xmm9 = _mm_xor_si128(xmm9, xmm1);\ - xmm10 = _mm_xor_si128(xmm10, xmm1);\ - xmm11 = _mm_xor_si128(xmm11, xmm1);\ - xmm12 = _mm_xor_si128(xmm12, xmm1);\ - xmm13 = _mm_xor_si128(xmm13, xmm1);\ - xmm14 = _mm_xor_si128(xmm14, xmm1);\ - xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\ - xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\ - /* SubBytes + MixBytes */\ - SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - \ - /* AddRoundConstant Q1024 */\ - xmm9 = ALL_FF;\ - xmm0 = _mm_xor_si128(xmm0, xmm9);\ - xmm1 = _mm_xor_si128(xmm1, xmm9);\ - xmm2 = _mm_xor_si128(xmm2, xmm9);\ - xmm3 = _mm_xor_si128(xmm3, xmm9);\ - xmm4 = _mm_xor_si128(xmm4, xmm9);\ - xmm5 = _mm_xor_si128(xmm5, xmm9);\ - xmm6 = _mm_xor_si128(xmm6, xmm9);\ - xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\ - xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\ - /* SubBytes + MixBytes*/ \ - SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - }\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm1);\ -}/**/ - - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - t0 = TRANSP_MASK;\ -\ - i6 = _mm_shuffle_epi8(i6, t0);\ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = 
_mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ - t1 = i2;\ - i4 = _mm_shuffle_epi8(i4, t0);\ - i5 = _mm_shuffle_epi8(i5, t0);\ - t2 = i4;\ - t3 = i6;\ - i7 = _mm_shuffle_epi8(i7, t0);\ -\ - /* continue with unpack using 4 temp registers */\ - t0 = i0;\ - t2 = _mm_unpackhi_epi16(t2, i5);\ - i4 = _mm_unpacklo_epi16(i4, i5);\ - t3 = _mm_unpackhi_epi16(t3, i7);\ - i6 = _mm_unpacklo_epi16(i6, i7);\ - t0 = _mm_unpackhi_epi16(t0, i1);\ - t1 = _mm_unpackhi_epi16(t1, i3);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - i0 = _mm_unpacklo_epi16(i0, i1);\ -\ - /* shuffle with immediate */\ - t0 = _mm_shuffle_epi32(t0, 216);\ - t1 = _mm_shuffle_epi32(t1, 216);\ - t2 = _mm_shuffle_epi32(t2, 216);\ - t3 = _mm_shuffle_epi32(t3, 216);\ - i0 = _mm_shuffle_epi32(i0, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - i4 = _mm_shuffle_epi32(i4, 216);\ - i6 = _mm_shuffle_epi32(i6, 216);\ -\ - /* continue with unpack */\ - t4 = i0;\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - t4 = _mm_unpackhi_epi32(t4, i2);\ - t5 = t0;\ - t0 = _mm_unpacklo_epi32(t0, t1);\ - t5 = _mm_unpackhi_epi32(t5, t1);\ - t6 = i4;\ - i4 = _mm_unpacklo_epi32(i4, i6);\ - t7 = t2;\ - t6 = _mm_unpackhi_epi32(t6, i6);\ - i2 = t0;\ - t2 = _mm_unpacklo_epi32(t2, t3);\ - i3 = t0;\ - t7 = _mm_unpackhi_epi32(t7, t3);\ -\ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - i1 = i0;\ - i1 = _mm_unpackhi_epi64(i1, i4);\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - i4 = t4;\ - i3 = _mm_unpackhi_epi64(i3, t2);\ - i5 = t4;\ - i2 = _mm_unpacklo_epi64(i2, t2);\ - i6 = t5;\ - i5 = _mm_unpackhi_epi64(i5, t6);\ - i7 = t5;\ - i4 = _mm_unpacklo_epi64(i4, t6);\ - i7 = _mm_unpackhi_epi64(i7, t7);\ - i6 = _mm_unpacklo_epi64(i6, t7);\ - /* transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - /* transpose matrix to get output format */\ - o1 = i0;\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o1 = _mm_unpackhi_epi64(o1, i1);\ - t0 = i2;\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - t0 = _mm_unpackhi_epi64(t0, i3);\ - t1 = i4;\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - t1 = _mm_unpackhi_epi64(t1, i5);\ - t2 = i6;\ - o0 = TRANSP_MASK;\ - i6 = _mm_unpacklo_epi64(i6, i7);\ - t2 = _mm_unpackhi_epi64(t2, i7);\ - /* load transpose mask into a register, because it will be used 8 times */\ - i0 = _mm_shuffle_epi8(i0, o0);\ - i2 = _mm_shuffle_epi8(i2, o0);\ - i4 = _mm_shuffle_epi8(i4, o0);\ - i6 = _mm_shuffle_epi8(i6, o0);\ - o1 = _mm_shuffle_epi8(o1, o0);\ - t0 = _mm_shuffle_epi8(t0, o0);\ - t1 = _mm_shuffle_epi8(t1, o0);\ - t2 = _mm_shuffle_epi8(t2, o0);\ - /* continue with unpack using 4 temp registers */\ - t3 = i4;\ - o2 = o1;\ - o0 = i0;\ - t4 = t1;\ - \ - t3 = _mm_unpackhi_epi16(t3, i6);\ - i4 = _mm_unpacklo_epi16(i4, i6);\ - o0 = _mm_unpackhi_epi16(o0, i2);\ - i0 = _mm_unpacklo_epi16(i0, i2);\ - o2 = _mm_unpackhi_epi16(o2, t0);\ - o1 = _mm_unpacklo_epi16(o1, t0);\ - t4 = _mm_unpackhi_epi16(t4, t2);\ - t1 = _mm_unpacklo_epi16(t1, t2);\ - /* shuffle with immediate */\ - i4 = _mm_shuffle_epi32(i4, 216);\ - t3 = _mm_shuffle_epi32(t3, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - o2 = _mm_shuffle_epi32(o2, 216);\ - i0 = _mm_shuffle_epi32(i0, 216);\ - o0 = _mm_shuffle_epi32(o0, 216);\ - t1 = _mm_shuffle_epi32(t1, 216);\ - t4 = _mm_shuffle_epi32(t4, 216);\ - /* continue 
with unpack */\ - i1 = i0;\ - i3 = o0;\ - i5 = o1;\ - i7 = o2;\ - i0 = _mm_unpacklo_epi32(i0, i4);\ - i1 = _mm_unpackhi_epi32(i1, i4);\ - o0 = _mm_unpacklo_epi32(o0, t3);\ - i3 = _mm_unpackhi_epi32(i3, t3);\ - o1 = _mm_unpacklo_epi32(o1, t1);\ - i5 = _mm_unpackhi_epi32(i5, t1);\ - o2 = _mm_unpacklo_epi32(o2, t4);\ - i7 = _mm_unpackhi_epi32(i7, t4);\ - /* transpose done */\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - xmm0 = ROUND_CONST_P[i];\ - xmm1 = ROUND_CONST_P[j];\ - xmm2 = ROUND_CONST_Q[i];\ - xmm3 = ROUND_CONST_Q[j];\ - VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ - xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ - ROUND_CONST_P[i] = xmm0;\ - ROUND_CONST_P[j] = xmm1;\ - ROUND_CONST_Q[i] = xmm2;\ - ROUND_CONST_Q[j] = xmm3;\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ - VPERM_Transform_RoundConst_CNT2(10, 11);\ - VPERM_Transform_RoundConst_CNT2(12, 13);\ - xmm0 = ALL_FF;\ - VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ - ALL_FF = xmm0;\ -}/**/ - - -void INIT(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm8 - xmm15 */ - xmm8 = chaining[0]; - xmm9 = chaining[1]; - xmm10 = chaining[2]; - xmm11 = chaining[3]; - xmm12 = chaining[4]; - xmm13 = chaining[5]; - xmm14 = chaining[6]; - xmm15 = chaining[7]; - - /* transform chaining value from column ordering into row ordering */ - VPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - - /* store transposed IV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; -} - -void TF1024(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - static __m128i QTEMP[8]; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm8 - xmm15 (Q = message) */ - xmm8 = message[0]; - xmm9 = message[1]; - xmm10 = message[2]; - xmm11 = message[3]; - xmm12 = message[4]; - xmm13 = message[5]; - xmm14 = message[6]; - xmm15 = message[7]; - - /* transform message M from column ordering into row ordering */ - VPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose(xmm8, 
xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - - /* store message M (Q input) for later */ - QTEMP[0] = xmm8; - QTEMP[1] = xmm9; - QTEMP[2] = xmm10; - QTEMP[3] = xmm11; - QTEMP[4] = xmm12; - QTEMP[5] = xmm13; - QTEMP[6] = xmm14; - QTEMP[7] = xmm15; - - /* xor CV to message to get P input */ - /* result: CV+M in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, (chaining[7])); - - /* compute permutation P */ - /* result: P(CV+M) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV+M)+CV in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, (chaining[7])); - - /* store P(CV+M)+CV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; - - /* load message M (Q input) into xmm8-15 */ - xmm8 = QTEMP[0]; - xmm9 = QTEMP[1]; - xmm10 = QTEMP[2]; - xmm11 = QTEMP[3]; - xmm12 = QTEMP[4]; - xmm13 = QTEMP[5]; - xmm14 = QTEMP[6]; - xmm15 = QTEMP[7]; - - /* compute permutation Q */ - /* result: Q(M) in xmm8...xmm15 */ - ROUNDS_Q(); - - /* xor Q output */ - /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, (chaining[7])); - - /* store CV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF1024(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - - /* load CV into registers xmm8 - xmm15 */ - xmm8 = chaining[0]; - xmm9 = chaining[1]; - xmm10 = chaining[2]; - xmm11 = chaining[3]; - xmm12 = chaining[4]; - xmm13 = chaining[5]; - xmm14 = chaining[6]; - xmm15 = chaining[7]; - - /* compute permutation P */ - /* result: P(CV) in xmm8...xmm15 */ - ROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, 
(chaining[7]));
-
- /* transpose CV back from row ordering to column ordering */
- /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
- Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
- VPERM_Transform_State(xmm0, xmm6, xmm13, xmm15, VPERM_OPT, xmm1, xmm2, xmm3, xmm5, xmm7, xmm10, xmm12);
-
- /* we only need to return the truncated half of the state */
- chaining[4] = xmm0;
- chaining[5] = xmm6;
- chaining[6] = xmm13;
- chaining[7] = xmm15;
-
- return;
-}
-
-#endif
-
diff --git a/algo/groestl/aes_ni/groestl-version.h b/algo/groestl/aes_ni/groestl-version.h
deleted file mode 100644
index 26736ec..0000000
--- a/algo/groestl/aes_ni/groestl-version.h
+++ /dev/null
@@ -1,10 +0,0 @@
-// specify assembly or intrinsics implementation
-//#define TASM
-#define TINTR
-
-// Not to be confused with AVX512VAES
-#define VAES
-// #define VAVX
-// #define VVPERM
-
-//#endif
diff --git a/algo/groestl/aes_ni/groestl256-asm-aes.h b/algo/groestl/aes_ni/groestl256-asm-aes.h
deleted file mode 100644
index 0810b5e..0000000
--- a/algo/groestl/aes_ni/groestl256-asm-aes.h
+++ /dev/null
@@ -1,529 +0,0 @@
-/* groestl-asm-aes.h Aug 2011
- *
- * Groestl implementation with inline assembly using ssse3, sse4.1, and aes
- * instructions.
- * Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * This code is placed in the public domain
- */
-
-#include "hash-groestl256.h"
-/* global constants */
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
-__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
-__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
-__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
-__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
-
-/* temporary variables */
-__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
-__attribute__ ((aligned (16))) unsigned char TEMP[3*16];
-
-
-#define tos(a) #a
-#define tostr(a) tos(a)
-
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b */
-#define MUL2(i, j, k){\
- asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\
- asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\
- asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\
- asm("pand xmm"tostr(j)", xmm"tostr(k)"");\
- asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\
-}/**/
-
-/* Yet another implementation of MixBytes.
- This time we use the formulae (3) from the paper "Byte Slicing Groestl".
- Input: a0, ..., a7
- Output: b0, ..., b7 = MixBytes(a0,...,a7).
- but we use the relations:
- t_i = a_i + a_{i+1}
- x_i = t_i + t_{i+3}
- y_i = t_i + t_{i+2} + a_{i+6}
- z_i = 2*x_i
- w_i = z_i + y_{i+4}
- v_i = 2*w_i
- b_i = v_{i+3} + y_{i+4}
- We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
- and then adding v_i computed in the meantime in registers xmm0..xmm7.
- We almost fit into 16 registers, need only 3 spills to memory.
- This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
- K.
Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* t_i = a_i + a_{i+1} */\ - asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ - asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\ - asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\ - asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\ - asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\ - asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\ - asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\ - asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\ - /* spill values y_4, y_5 to memory */\ - asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\ - asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ - asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ - asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\ - \ - /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - /* compute w_i : add y_{i+4} */\ - asm("movaps xmm"tostr(b1)", [ALL_1B]");\ - MUL2(a0, b0, b1);\ - asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\ - MUL2(a1, b0, b1);\ - asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\ - MUL2(a2, b0, b1);\ - asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\ - MUL2(a3, b0, b1);\ - asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\ - MUL2(a4, b0, b1);\ - asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\ - MUL2(a5, b0, b1);\ - asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\ - MUL2(a6, b0, b1);\ - asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\ - MUL2(a7, b0, b1);\ - asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\ - \ - /* compute v_i : double w_i */\ - /* add to y_4 y_5 .. v3, v4, ... 
*/\ - MUL2(a0, b0, b1);\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ - MUL2(a1, b0, b1);\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\ - MUL2(a2, b0, b1);\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ - MUL2(a5, b0, b1);\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\ - MUL2(a6, b0, b1);\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ - MUL2(a7, b0, b1);\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ - MUL2(a3, b0, b1);\ - MUL2(a4, b0, b1);\ - asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\ - asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ -}/*MixBytes*/ - -#define SET_CONSTANTS(){\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}while(0); - -#define Push_All_Regs() do{\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}while(0); - -#define Pop_All_Regs() do{\ -/* not using any... 
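/* [Editor's note] The MUL2 macro used throughout MixBytes above is the
 * GF(2^8) doubling ("xtime") step: pcmpgtb against a zeroed register yields
 * 0xFF in every byte whose top bit is set, paddb doubles each byte, and the
 * masked 0x1b is XORed back in to reduce modulo the AES/Groestl polynomial
 * x^8 + x^4 + x^3 + x + 1. A minimal sketch of the same operation, first per
 * byte and then as a direct SSE2 mirror of the macro; the helper names are
 * illustrative, not part of this source. */
#include <stdint.h>
#include <emmintrin.h>

static inline uint8_t gf256_double(uint8_t a)
{
    uint8_t mask = (uint8_t)-(a >> 7);          /* 0xFF if msb set, else 0 */
    return (uint8_t)((a << 1) ^ (mask & 0x1b)); /* shift, then reduce      */
}

static inline __m128i mul2_sse2(__m128i i, __m128i all_1b)
{
    __m128i j = _mm_cmpgt_epi8(_mm_setzero_si128(), i); /* pcmpgtb: msb mask  */
    i = _mm_add_epi8(i, i);                             /* paddb: bytewise *2 */
    return _mm_xor_si128(i, _mm_and_si128(j, all_1b));  /* pand + pxor        */
}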
- asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}while(0); - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant */\ - asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - /* ShiftBytes + SubBytes (interleaved) */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ - \ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - \ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - \ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ - \ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd 
xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - \ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ - \ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ - asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i7)", 
xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -void INIT256(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm12 - xmm15 */ - asm ("movaps xmm12, [rdi+0*16]"); - asm ("movaps xmm13, [rdi+1*16]"); - asm ("movaps xmm14, [rdi+2*16]"); - asm ("movaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm12"); - asm ("movaps [rdi+1*16], xmm2"); - asm ("movaps [rdi+2*16], xmm6"); - asm ("movaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("movaps xmm12, [rsi+0*16]"); - asm ("movaps xmm13, [rsi+1*16]"); - asm ("movaps xmm14, [rsi+2*16]"); - asm ("movaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm0, [rdi+1*16]"); - asm ("movaps xmm4, [rdi+2*16]"); - asm ("movaps xmm5, [rdi+3*16]"); - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("pxor xmm8, xmm12"); - asm ("pxor xmm0, xmm2"); - asm ("pxor xmm4, xmm6"); - asm ("pxor xmm5, xmm7"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("pxor xmm0, xmm8"); - asm ("pxor xmm1, xmm10"); - asm ("pxor xmm2, xmm12"); - asm ("pxor xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("pxor xmm0, [rdi+0*16]"); - asm ("pxor xmm1, [rdi+1*16]"); - asm ("pxor xmm2, [rdi+2*16]"); - asm ("pxor xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm0"); - asm ("movaps [rdi+1*16], xmm1"); - asm ("movaps [rdi+2*16], xmm2"); - asm ("movaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax 
noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm10, [rdi+1*16]"); - asm ("movaps xmm12, [rdi+2*16]"); - asm ("movaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm10, [rdi+1*16]"); - asm ("pxor xmm12, [rdi+2*16]"); - asm ("pxor xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+2*16], xmm9"); - asm ("movaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - diff --git a/algo/groestl/aes_ni/groestl256-asm-avx.h b/algo/groestl/aes_ni/groestl256-asm-avx.h deleted file mode 100644 index e7cb4c7..0000000 --- a/algo/groestl/aes_ni/groestl256-asm-avx.h +++ /dev/null @@ -1,519 +0,0 @@ -/* groestl-asm-avx.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx - * instructions. - * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz - * - * This code is placed in the public domain - */ - -#include "hash-groestl256.h" - -/* global variables */ -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; -__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; -__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (32))) unsigned char ALL_1B[32]; -__attribute__ ((aligned (32))) unsigned char ALL_FF[32]; - -/* temporary variables */ -__attribute__ ((aligned (32))) unsigned char TEMP[6*32]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}while(0); - -#define Push_All_Regs() do{\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}while(0); - -#define Pop_All_Regs() do{\ -/* not using any... - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}while(0); - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2(i, j, k, z){\ - asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b - * xmm[z] has to be zero */ -#define VMUL2v2(i, j, k, z){\ - asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\ - asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\ - asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* Yet another implementation of MixBytes. - This time we use the formulae (3) from the paper "Byte Slicing Groestl". 
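/* [Editor's note] The relations this comment goes on to list compute the
 * MixBytes circulant (02,02,03,04,05,03,05,07) from shared subterms and two
 * doubling passes. A scalar model, one byte per row, with '+' read as XOR,
 * '2*' as GF(2^8) doubling, t_i = a_i + a_{i+1} (which is what the
 * instruction sequence below actually computes), and all indices taken
 * mod 8, an assumption the comment leaves implicit; gf256_double and
 * mixbytes_model are illustrative names, not part of this source. */
#include <stdint.h>

static inline uint8_t gf256_double(uint8_t a)
{
    return (uint8_t)((a << 1) ^ ((uint8_t)-(a >> 7) & 0x1b));
}

static void mixbytes_model(const uint8_t a[8], uint8_t b[8])
{
    uint8_t t[8], x[8], y[8], w[8], v[8];
    for (int i = 0; i < 8; i++) t[i] = a[i] ^ a[(i + 1) & 7];
    for (int i = 0; i < 8; i++) {
        x[i] = t[i] ^ t[(i + 3) & 7];
        y[i] = t[i] ^ t[(i + 2) & 7] ^ a[(i + 6) & 7];
    }
    for (int i = 0; i < 8; i++)
        w[i] = gf256_double(x[i]) ^ y[(i + 4) & 7];   /* z_i, then w_i */
    for (int i = 0; i < 8; i++) v[i] = gf256_double(w[i]);
    for (int i = 0; i < 8; i++) b[i] = v[(i + 3) & 7] ^ y[(i + 4) & 7];
}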
- Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). - but we use the relations: - t_i = a_i + a_{i+3} - x_i = t_i + t_{i+3} - y_i = t_i + t+{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\ - asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ - asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ - asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ - asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ - asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ - asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ - asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ - asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ - \ - /* t_i = a_i + a_{i+1} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\ - \ - asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\ - \ - /* spill values y_4, y_5 to memory */\ - asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\ - asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\ - asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\ - \ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ - asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", 
xmm"tostr(b0)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\ - \ - /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\ - VMUL2(a7, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a0, b0, b1, b2);\ - \ - /* compute w_i : add y_{i+4} */\ - asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\ - asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\ - asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\ - asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\ - asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\ - asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\ - asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\ - asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a7, b0, b1, b2);\ - \ - /* add to y_4 y_5 .. v3, v4, ... */\ - asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\ - asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\ - asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\ - asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\ - asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\ - asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\ - asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\ - asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\ -}/*MixBytes*/ - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant */\ - asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - /* ShiftBytes + SubBytes (interleaved) */\ - asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", 
[SUBSH_MASK+5*16]");\ - asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\ -\ - asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ -\ - asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ -\ - asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\ - asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\ - asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\ - asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\ 
- asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -void INIT256(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm12 - xmm15 */ - asm ("vmovaps xmm12, [rdi+0*16]"); - asm ("vmovaps xmm13, [rdi+1*16]"); - asm ("vmovaps xmm14, [rdi+2*16]"); - asm ("vmovaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("vmovaps [rdi+0*16], xmm12"); - asm ("vmovaps [rdi+1*16], xmm2"); - asm ("vmovaps [rdi+2*16], xmm6"); - asm ("vmovaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message 
into registers xmm12 - xmm15 (Q = message) */ - asm ("vmovaps xmm12, [rsi+0*16]"); - asm ("vmovaps xmm13, [rsi+1*16]"); - asm ("vmovaps xmm14, [rsi+2*16]"); - asm ("vmovaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("vpxor xmm8, xmm12, [rdi+0*16]"); - asm ("vpxor xmm0, xmm2, [rdi+1*16]"); - asm ("vpxor xmm4, xmm6, [rdi+2*16]"); - asm ("vpxor xmm5, xmm7, [rdi+3*16]"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("vpxor xmm0, xmm0, xmm8"); - asm ("vpxor xmm1, xmm1, xmm10"); - asm ("vpxor xmm2, xmm2, xmm12"); - asm ("vpxor xmm3, xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("vpxor xmm0, xmm0, [rdi+0*16]"); - asm ("vpxor xmm1, xmm1, [rdi+1*16]"); - asm ("vpxor xmm2, xmm2, [rdi+2*16]"); - asm ("vpxor xmm3, xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("vmovaps [rdi+0*16], xmm0"); - asm ("vmovaps [rdi+1*16], xmm1"); - asm ("vmovaps [rdi+2*16], xmm2"); - asm ("vmovaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("vmovaps xmm8, [rdi+0*16]"); - asm ("vmovaps xmm10, [rdi+1*16]"); - asm ("vmovaps xmm12, [rdi+2*16]"); - asm ("vmovaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("vpxor xmm8, xmm8, [rdi+0*16]"); - asm ("vpxor xmm10, xmm10, [rdi+1*16]"); - asm ("vpxor xmm12, xmm12, [rdi+2*16]"); - asm ("vpxor xmm14, xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - - /* we only need to return the truncated half of the state */ - asm ("vmovaps [rdi+2*16], xmm9"); - asm ("vmovaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - diff --git 
a/algo/groestl/aes_ni/groestl256-asm-vperm.h b/algo/groestl/aes_ni/groestl256-asm-vperm.h deleted file mode 100644 index a25ade7..0000000 --- a/algo/groestl/aes_ni/groestl256-asm-vperm.h +++ /dev/null @@ -1,856 +0,0 @@ -/* groestl-asm-vperm.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3 instructions. - * Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz - * - * Based on the vperm and aes_ni implementations of the hash function Groestl - * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ - * Institute of Applied Mathematics, Middle East Technical University, Turkey - * - * This code is placed in the public domain - */ - -#include "hash-groestl256.h" - -/* global constants */ -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (16))) unsigned char ALL_0F[16]; -__attribute__ ((aligned (16))) unsigned char ALL_15[16]; -__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; -__attribute__ ((aligned (16))) unsigned char ALL_63[16]; -__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; -__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16]; - -/* temporary variables */ -__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16]; -__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP[8*16]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_SHARED_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\ - ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\ - ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\ - ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\ - ((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\ - ((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\ - ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\ - ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\ - ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\ - ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\ - ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\ - ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\ - ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\ - ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\ - ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\ - ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\ - ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\ - ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\ - ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\ - ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\ 
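/* [Editor's note] The VPERM_* tables initialized here drive the vperm trick:
 * each byte is split into its two nibbles, each nibble selects one of 16
 * table bytes via pshufb, and the two lookups are XORed, giving a
 * constant-time affine transform with no AES instructions. A sketch with the
 * two 16-byte table halves passed in (tab0 = table+0*16 indexed by the high
 * nibble, tab1 = table+1*16 by the low nibble, matching the pshufb pairing
 * in VPERM_Transform_No_Const below); names are illustrative. */
#include <tmmintrin.h>

static inline __m128i vperm_transform(__m128i x, __m128i tab0, __m128i tab1)
{
    const __m128i mask0f = _mm_set1_epi8(0x0F);
    __m128i lo = _mm_and_si128(x, mask0f);                    /* low nibbles  */
    __m128i hi = _mm_and_si128(_mm_srli_epi16(x, 4), mask0f); /* high nibbles */
    return _mm_xor_si128(_mm_shuffle_epi8(tab0, hi),          /* tab0[hi]     */
                         _mm_shuffle_epi8(tab1, lo));         /* ^ tab1[lo]   */
}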
- ((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\ - ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\ - ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\ - ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\ - ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\ - ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\ - ((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\ - ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\ -/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\ - ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\ - ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\ - ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\ - ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\ - ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\ -}/**/ - -/* VPERM - * Transform w/o settings c* - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\ - asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ - asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\ - asm ("psrld xmm"tostr(t0)", 4");\ - asm ("psrld xmm"tostr(t1)", 4");\ - asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ - asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\ - asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\ - asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\ - asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\ - asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\ -}/**/ - -#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ - asm ("movaps xmm"tostr(c0)", [ALL_0F]");\ - asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\ - asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\ -}/**/ - -/* VPERM - * Transform - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Transform State - * inputs: - * a0-a3 = state - * table = transformation table to use - * t* = clobbers - * outputs: - * a0-a3 = transformed state - * */ -#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ - VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Add Constant to State - * inputs: - * a0-a7 = state - * constant = constant to add - * t0 = clobber - * outputs: - * a0-a7 = state + constant - * */ -#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ - asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm 
("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * Set Substitute Core Constants - * */ -#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ - VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ -}/**/ - -/* VPERM - * Substitute Core - * first part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0 = 1 row - * t*, c* = clobbers - * outputs: - * b0a, b0b = inputs for lookup step - * */ -#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ - asm ("psrld xmm"tostr(t0)", 4");\ - asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\ - asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\ - asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\ - asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\ - asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\ - asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * Lookup - * second part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0a, a0b = output of Substitution Core - * table = lookup table to use (*1 / *2 / *4) - * t0 = clobber - * outputs: - * b0 = output of sbox + multiplication - * */ -#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ - asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\ - asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\ - asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * SubBytes and *2 / *4 - * this function is derived from: - * Constant-time SSSE3 AES core implementation - * by Mike Hamburg - * and - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0-a7 = state - * t*, c* = clobbers - * outputs: - * a0-a7 = state * 4 - * c2 = row0 * 2 -> b0 - * c1 = row7 * 2 -> b3 - * c0 = row7 * 1 -> b4 - * t2 = row4 * 1 -> b7 - * TEMP_MUL1 = row(i) * 1 - * TEMP_MUL2 = row(i) * 2 - * - * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ -#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ - /* set Constants */\ - VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ - /* row 1 */\ - VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ - /* --- */\ - 
/* row 2 */\ - VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ - /* --- */\ - /* row 3 */\ - VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ - /* --- */\ - /* row 5 */\ - VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ - /* --- */\ - /* row 6 */\ - VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ - /* --- */\ - /* row 7 */\ - VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ - VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ - /* --- */\ - /* row 4 */\ - VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ - /* --- */\ - /* row 0 */\ - VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ - asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ - /* --- */\ -}/**/ - - -/* Optimized MixBytes - * inputs: - * a0-a7 = (row0-row7) * 4 - * b0 = row0 * 2 - * b3 = row7 * 2 - * b4 = row7 * 1 - * b7 = row4 * 1 - * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 - * output: b0-b7 - * */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* save one value */\ - asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\ - /* 1 */\ - asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\ - asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\ - \ - /* 2 */\ - asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\ - asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\ - \ - /* 4 */\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\ - /*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\ - asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! 
*/\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\ - \ - /* 3 */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\ - /*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\ - asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\ - \ - /* 5 */\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ - /*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\ - \ - /* 6 */\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ - \ - /* 7 */\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ - \ - /* 8 */\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ - \ - /* 9 */\ - asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* 10 */\ - asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\ - \ - /* 11 */\ - asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ - \ - /* 12 */\ - asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* 13 */\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ -}/**/ - -//#if (LENGTH <= 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\ - 
((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}/**/ - -#define Push_All_Regs(){\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}/**/ - -#define Pop_All_Regs(){\ -/* not using any... - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}/**/ - - -/* vperm: - * transformation before rounds with ipt - * first round add transformed constant - * middle rounds: add constant XOR 0x15...15 - * last round: additionally add 0x15...15 after MB - * transformation after rounds with opt - */ -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant + ShiftBytes (interleaved) */\ - asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - /* SubBytes + Multiplication by 2 and 4 */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ -} - - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: 
i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ -\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ -\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ -\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ -\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ -\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ - asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 
512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\ - asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\ - asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\ - VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("pxor xmm1, [ALL_15]");\ - asm ("pxor xmm2, [ALL_15]");\ - asm ("pxor xmm3, [ALL_15]");\ - asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\ - asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\ - asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\ - asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - asm ("movaps xmm0, [ROUND_CONST_Lx]");\ - VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("movaps [ROUND_CONST_Lx], xmm0");\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ -}/**/ - -void INIT256(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm12 - xmm15 */ - asm ("movaps xmm12, [rdi+0*16]"); - asm ("movaps xmm13, [rdi+1*16]"); - asm ("movaps xmm14, [rdi+2*16]"); - asm ("movaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm12"); - asm ("movaps [rdi+1*16], xmm2"); - asm ("movaps [rdi+2*16], xmm6"); - asm ("movaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm 
(".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("movaps xmm12, [rsi+0*16]"); - asm ("movaps xmm13, [rsi+1*16]"); - asm ("movaps xmm14, [rsi+2*16]"); - asm ("movaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm0, [rdi+1*16]"); - asm ("movaps xmm4, [rdi+2*16]"); - asm ("movaps xmm5, [rdi+3*16]"); - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("pxor xmm8, xmm12"); - asm ("pxor xmm0, xmm2"); - asm ("pxor xmm4, xmm6"); - asm ("pxor xmm5, xmm7"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("pxor xmm0, xmm8"); - asm ("pxor xmm1, xmm10"); - asm ("pxor xmm2, xmm12"); - asm ("pxor xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("pxor xmm0, [rdi+0*16]"); - asm ("pxor xmm1, [rdi+1*16]"); - asm ("pxor xmm2, [rdi+2*16]"); - asm ("pxor xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm0"); - asm ("movaps [rdi+1*16], xmm1"); - asm ("movaps [rdi+2*16], xmm2"); - asm ("movaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm10, [rdi+1*16]"); - asm ("movaps xmm12, [rdi+2*16]"); - asm ("movaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm10, [rdi+1*16]"); - asm ("pxor xmm12, [rdi+2*16]"); - asm ("pxor xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ 
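An aside for orientation, not part of the patch: every MixBytes variant in these files, asm or intrinsics, multiplies the eight state rows by the circulant matrix circ(2,2,3,4,5,3,5,7) over GF(2^8). The scalar sketch below is illustrative only; it models one byte column using the t/x/y/z/w/v relations quoted in the MixBytes comments of the intrinsics files further down, and the helper names gf_mul2 and mix_bytes_column are hypothetical.

#include <stdint.h>

/* GF(2^8) doubling (xtime): shift left, reduce with 0x1b when the top
 * bit was set. Scalar equivalent of the MUL2/VMUL2 macros. */
static uint8_t gf_mul2( uint8_t x )
{
   return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
}

/* One column of MixBytes via the byte-slicing relations (XOR is
 * addition in GF(2^8)):
 *   t_i = a_i + a_{i+1}
 *   x_i = t_i + t_{i+3}
 *   y_i = t_i + t_{i+2} + a_{i+6}
 *   w_i = 2*x_i + y_{i+4}
 *   b_i = 2*w_{i+3} + y_{i+4}
 */
static void mix_bytes_column( const uint8_t a[8], uint8_t b[8] )
{
   uint8_t t[8], x[8], y[8], w[8];
   for ( int i = 0; i < 8; i++ ) t[i] = a[i] ^ a[ (i+1) & 7 ];
   for ( int i = 0; i < 8; i++ ) x[i] = t[i] ^ t[ (i+3) & 7 ];
   for ( int i = 0; i < 8; i++ ) y[i] = t[i] ^ t[ (i+2) & 7 ] ^ a[ (i+6) & 7 ];
   for ( int i = 0; i < 8; i++ ) w[i] = gf_mul2( x[i] ) ^ y[ (i+4) & 7 ];
   for ( int i = 0; i < 8; i++ ) b[i] = gf_mul2( w[ (i+3) & 7 ] ) ^ y[ (i+4) & 7 ];
}

Only doubling and XOR appear because the relations decompose the 3x, 4x, 5x and 7x row multiples into 2x and 2*(2x) terms; the SIMD macros compute exactly this, one whole row per register.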
- /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+2*16], xmm9"); - asm ("movaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - - diff --git a/algo/groestl/aes_ni/groestl256-intr-aes.h b/algo/groestl/aes_ni/groestl256-intr-aes.h index 57dd930..15517cf 100644 --- a/algo/groestl/aes_ni/groestl256-intr-aes.h +++ b/algo/groestl/aes_ni/groestl256-intr-aes.h @@ -11,18 +11,6 @@ #include #include "hash-groestl256.h" -/* global constants */ -__m128i ROUND_CONST_Lx; -__m128i ROUND_CONST_L0[ROUNDS512]; -__m128i ROUND_CONST_L7[ROUNDS512]; -//__m128i ROUND_CONST_P[ROUNDS1024]; -//__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_1B; -__m128i ALL_FF; - - #define tos(a) #a #define tostr(a) tos(a) @@ -113,7 +101,7 @@ __m128i ALL_FF; \ /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ /* compute w_i : add y_{i+4} */\ - b1 = ALL_1B;\ + b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\ MUL2(a0, b0, b1);\ a0 = _mm_xor_si128(a0, TEMP0);\ MUL2(a1, b0, b1);\ @@ -153,24 +141,35 @@ __m128i ALL_FF; b1 = _mm_xor_si128(b1, a4);\ }/*MixBytes*/ -#define SET_CONSTANTS(){\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ - SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ - SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ - SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ - SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ - SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ - SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ - }\ - ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ -}while(0); \ + +static const uint64_t round_const_l0[] __attribute__ ((aligned (64))) = +{ + 0x7060504030201000, 0xffffffffffffffff, + 0x7161514131211101, 0xffffffffffffffff, + 0x7262524232221202, 0xffffffffffffffff, + 0x7363534333231303, 0xffffffffffffffff, + 0x7464544434241404, 0xffffffffffffffff, + 0x7565554535251505, 0xffffffffffffffff, + 0x7666564636261606, 0xffffffffffffffff, + 0x7767574737271707, 0xffffffffffffffff, + 0x7868584838281808, 0xffffffffffffffff, + 0x7969594939291909, 0xffffffffffffffff +}; + +static const uint64_t round_const_l7[] __attribute__ ((aligned (64))) = +{ +0x0000000000000000, 0x8f9fafbfcfdfefff, +0x0000000000000000, 0x8e9eaebecedeeefe, +0x0000000000000000, 0x8d9dadbdcdddedfd, +0x0000000000000000, 0x8c9cacbcccdcecfc, +0x0000000000000000, 0x8b9babbbcbdbebfb, +0x0000000000000000, 0x8a9aaabacadaeafa, +0x0000000000000000, 0x8999a9b9c9d9e9f9, +0x0000000000000000, 0x8898a8b8c8d8e8f8, +0x0000000000000000, 0x8797a7b7c7d7e7f7, +0x0000000000000000, 0x8696a6b6c6d6e6f6 +}; + /* one round * i = 
round number @@ -179,34 +178,42 @@ __m128i ALL_FF; */ #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ /* AddRoundConstant */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a4 = _mm_xor_si128(a4, b1);\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + b1 = m128_const_64( 0xffffffffffffffff, 0 ); \ + a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \ + a1 = _mm_xor_si128( a1, b1 ); \ + a2 = _mm_xor_si128( a2, b1 ); \ + a3 = _mm_xor_si128( a3, b1 ); \ + a4 = _mm_xor_si128( a4, b1 ); \ + a5 = _mm_xor_si128( a5, b1 ); \ + a6 = _mm_xor_si128( a6, b1 ); \ + a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \ \ /* ShiftBytes + SubBytes (interleaved) */\ b0 = _mm_xor_si128(b0, b0);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a0 = _mm_aesenclast_si128(a0, b0);\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a1 = _mm_aesenclast_si128(a1, b0);\ - a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ - a2 = _mm_aesenclast_si128(a2, b0);\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a3 = _mm_aesenclast_si128(a3, b0);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a4 = _mm_aesenclast_si128(a4, b0);\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a5 = _mm_aesenclast_si128(a5, b0);\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a6 = _mm_aesenclast_si128(a6, b0);\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - a7 = _mm_aesenclast_si128(a7, b0);\ + a0 = _mm_shuffle_epi8( a0, m128_const_64( 0x03060a0d08020509, \ + 0x0c0f0104070b0e00 ) ); \ + a0 = _mm_aesenclast_si128( a0, b0 );\ + a1 = _mm_shuffle_epi8( a1, m128_const_64( 0x04070c0f0a03060b, \ + 0x0e090205000d0801 ) ); \ + a1 = _mm_aesenclast_si128( a1, b0 );\ + a2 = _mm_shuffle_epi8( a2, m128_const_64( 0x05000e090c04070d, \ + 0x080b0306010f0a02 ) ); \ + a2 = _mm_aesenclast_si128( a2, b0 );\ + a3 = _mm_shuffle_epi8( a3, m128_const_64( 0x0601080b0e05000f, \ + 0x0a0d040702090c03 ) ); \ + a3 = _mm_aesenclast_si128( a3, b0 );\ + a4 = _mm_shuffle_epi8( a4, m128_const_64( 0x0702090c0f060108, \ + 0x0b0e0500030a0d04 ) ); \ + a4 = _mm_aesenclast_si128( a4, b0 );\ + a5 = _mm_shuffle_epi8( a5, m128_const_64( 0x00030b0e0907020a, \ + 0x0d080601040c0f05 ) ); \ + a5 = _mm_aesenclast_si128( a5, b0 );\ + a6 = _mm_shuffle_epi8( a6, m128_const_64( 0x01040d080b00030c, \ + 0x0f0a0702050e0906 ) ); \ + a6 = _mm_aesenclast_si128( a6, b0 );\ + a7 = _mm_shuffle_epi8( a7, m128_const_64( 0x02050f0a0d01040e, \ + 0x090c000306080b07 ) ); \ + a7 = _mm_aesenclast_si128( a7, b0 );\ \ /* MixBytes */\ MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ @@ -235,7 +242,7 @@ __m128i ALL_FF; * clobbers: t0 */ #define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ + t0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 ); \ \ i0 = _mm_shuffle_epi8(i0, t0);\ i1 = _mm_shuffle_epi8(i1, t0);\ diff --git a/algo/groestl/aes_ni/groestl256-intr-avx.h b/algo/groestl/aes_ni/groestl256-intr-avx.h deleted file mode 100644 index 3eb8397..0000000 --- a/algo/groestl/aes_ni/groestl256-intr-avx.h +++ /dev/null @@ -1,482 +0,0 @@ -/* groestl-intr-avx.h Aug 2011 - * - * Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx - * instructions. - * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz
- *
- * This code is placed in the public domain
- */
-
-#include
-#include
-#include
-#include "hash-groestl256.h"
-
-/* global constants */
-__m128i ROUND_CONST_Lx;
-__m128i ROUND_CONST_L0[ROUNDS512];
-__m128i ROUND_CONST_L7[ROUNDS512];
-__m128i ROUND_CONST_P[ROUNDS1024];
-__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_FF;
-//#if LENGTH <= 256
-__m128i ALL_1B;
-//#else
-//__m256d ALL_1B;
-//#endif
-
-#define tos(a) #a
-#define tostr(a) tos(a)
-
-#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos)))
-#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos))
-
-#define SET_CONSTANTS(){\
- ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
- ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
- TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
- SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
- SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
- SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
- SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
- SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
- SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
- SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
- SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
- for(i = 0; i < ROUNDS512; i++)\
- {\
- ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
- ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
- }\
- ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
-}while(0);
-
-/* xmm[i] will be multiplied by 2
- * xmm[j] will be lost
- * xmm[k] has to be all 0x1b
- * xmm[z] has to be zero */
-#define VMUL2(i, j, k, z){\
- j = _mm_cmpgt_epi8(z, i);\
- i = _mm_add_epi8(i, i);\
- j = _mm_and_si128(j, k);\
- i = _mm_xor_si128(i, j);\
-}/**/
-
-/* Yet another implementation of MixBytes.
- This time we use the formulae (3) from the paper "Byte Slicing Groestl".
- Input: a0, ..., a7
- Output: b0, ..., b7 = MixBytes(a0,...,a7).
- but we use the relations:
- t_i = a_i + a_{i+1}
- x_i = t_i + t_{i+3}
- y_i = t_i + t_{i+2} + a_{i+6}
- z_i = 2*x_i
- w_i = z_i + y_{i+4}
- v_i = 2*w_i
- b_i = v_{i+3} + y_{i+4}
- We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
- and then adding v_i computed in the meantime in registers xmm0..xmm7.
- We almost fit into 16 registers, need only 3 spills to memory.
- This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
- K. Matusiewicz, 2011/05/29 */
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
- /* xmm8..xmm15 = a2 a3...
a0 a1 */\ - b0 = a2;\ - b1 = a3;\ - b2 = a4;\ - b3 = a5;\ - b4 = a6;\ - b5 = a7;\ - b6 = a0;\ - b7 = a1;\ - \ - /* t_i = a_i + a_{i+1} */\ - a0 = _mm_xor_si128(a0, a1);\ - a1 = _mm_xor_si128(a1, a2);\ - a2 = _mm_xor_si128(a2, a3);\ - a3 = _mm_xor_si128(a3, a4);\ - a4 = _mm_xor_si128(a4, a5);\ - a5 = _mm_xor_si128(a5, a6);\ - a6 = _mm_xor_si128(a6, a7);\ - a7 = _mm_xor_si128(a7, b6);\ - \ - /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ - b0 = _mm_xor_si128(b0, a4);\ - b1 = _mm_xor_si128(b1, a5);\ - b2 = _mm_xor_si128(b2, a6);\ - b3 = _mm_xor_si128(b3, a7);\ - b4 = _mm_xor_si128(b4, a0);\ - b5 = _mm_xor_si128(b5, a1);\ - b6 = _mm_xor_si128(b6, a2);\ - b7 = _mm_xor_si128(b7, a3);\ - \ - b0 = _mm_xor_si128(b0, a6);\ - b1 = _mm_xor_si128(b1, a7);\ - b2 = _mm_xor_si128(b2, a0);\ - b3 = _mm_xor_si128(b3, a1);\ - b4 = _mm_xor_si128(b4, a2);\ - b5 = _mm_xor_si128(b5, a3);\ - b6 = _mm_xor_si128(b6, a4);\ - b7 = _mm_xor_si128(b7, a5);\ - \ - /* spill values y_4, y_5 to memory */\ - TEMP0 = b0;\ - TEMP1 = b1;\ - TEMP2 = b2;\ - \ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - b0 = a0;\ - b1 = a1;\ - TEMP3 = a2;\ - \ - /* compute x_i = t_i + t_{i+3} */\ - a0 = _mm_xor_si128(a0, a3);\ - a1 = _mm_xor_si128(a1, a4);\ - a2 = _mm_xor_si128(a2, a5);\ - a3 = _mm_xor_si128(a3, a6);\ - a4 = _mm_xor_si128(a4, a7);\ - a5 = _mm_xor_si128(a5, b0);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, TEMP3);\ - \ - /*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - b1 = ALL_1B;\ - b2 = _mm_xor_si128(b2, b2);\ - VMUL2(a7, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a0, b0, b1, b2);\ - \ - /* compute w_i : add y_{i+4} */\ - a0 = _mm_xor_si128(a0, TEMP0);\ - a1 = _mm_xor_si128(a1, TEMP1);\ - a2 = _mm_xor_si128(a2, TEMP2);\ - a3 = _mm_xor_si128(a3, b3);\ - a4 = _mm_xor_si128(a4, b4);\ - a5 = _mm_xor_si128(a5, b5);\ - a6 = _mm_xor_si128(a6, b6);\ - a7 = _mm_xor_si128(a7, b7);\ - \ - /*compute v_i: double w_i */\ - VMUL2(a0, b0, b1, b2);\ - VMUL2(a1, b0, b1, b2);\ - VMUL2(a2, b0, b1, b2);\ - VMUL2(a3, b0, b1, b2);\ - VMUL2(a4, b0, b1, b2);\ - VMUL2(a5, b0, b1, b2);\ - VMUL2(a6, b0, b1, b2);\ - VMUL2(a7, b0, b1, b2);\ - \ - /* add to y_4 y_5 .. v3, v4, ... 
*/\ - b0 = _mm_xor_si128(a3, TEMP0);\ - b1 = _mm_xor_si128(a4, TEMP1);\ - b2 = _mm_xor_si128(a5, TEMP2);\ - b3 = _mm_xor_si128(b3, a6);\ - b4 = _mm_xor_si128(b4, a7);\ - b5 = _mm_xor_si128(b5, a0);\ - b6 = _mm_xor_si128(b6, a1);\ - b7 = _mm_xor_si128(b7, a2);\ -}/*MixBytes*/ - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* Add Round Constant */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a4 = _mm_xor_si128(a4, b1);\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ - \ - /* ShiftBytes + SubBytes (interleaved) */\ - b0 = _mm_xor_si128(b0, b0);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a0 = _mm_aesenclast_si128(a0, b0);\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a1 = _mm_aesenclast_si128(a1, b0);\ - a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ - a2 = _mm_aesenclast_si128(a2, b0);\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a3 = _mm_aesenclast_si128(a3, b0);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a4 = _mm_aesenclast_si128(a4, b0);\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a5 = _mm_aesenclast_si128(a5, b0);\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a6 = _mm_aesenclast_si128(a6, b0);\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - a7 = _mm_aesenclast_si128(a7, b0);\ - \ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ - \ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ - \ - o1 = _mm_unpackhi_epi16(i0, i1);\ - i0 = _mm_unpacklo_epi16(i0, i1);\ - t0 = _mm_unpackhi_epi16(i2, i3);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - \ - i0 = _mm_shuffle_epi32(i0, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - t0 
= _mm_shuffle_epi32(t0, 216);\ - \ - o2 = _mm_unpackhi_epi32(i0, i2);\ - o3 = _mm_unpackhi_epi32(o1, t0);\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - o1 = _mm_unpacklo_epi32(o1, t0);\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - o1 = _mm_unpackhi_epi64(i0, i4);\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - o2 = _mm_unpacklo_epi64(i1, i5);\ - o3 = _mm_unpackhi_epi64(i1, i5);\ - o4 = _mm_unpacklo_epi64(i2, i6);\ - o5 = _mm_unpackhi_epi64(i2, i6);\ - o6 = _mm_unpacklo_epi64(i3, i7);\ - o7 = _mm_unpackhi_epi64(i3, i7);\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - o0 = _mm_unpackhi_epi64(i0, i1);\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o1 = _mm_unpackhi_epi64(i2, i3);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - o2 = _mm_unpackhi_epi64(i4, i5);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - o3 = _mm_unpackhi_epi64(i6, i7);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - t0 = _mm_xor_si128(t0, t0);\ - i1 = _mm_unpackhi_epi64(i0, t0);\ - i0 = _mm_unpacklo_epi64(i0, t0);\ - i3 = _mm_unpackhi_epi64(i2, t0);\ - i2 = _mm_unpacklo_epi64(i2, t0);\ - i5 = _mm_unpackhi_epi64(i4, t0);\ - i4 = _mm_unpacklo_epi64(i4, t0);\ - i7 = _mm_unpackhi_epi64(i6, t0);\ - i6 = _mm_unpacklo_epi64(i6, t0);\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - - -void INIT256(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; - static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; - - /* load IV into registers xmm12 - xmm15 */ - xmm12 = chaining[0]; - xmm13 = chaining[1]; - xmm14 = chaining[2]; - xmm15 = chaining[3]; - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* store transposed IV */ - chaining[0] = xmm12; - chaining[1] = xmm2; - chaining[2] = xmm6; - chaining[3] = xmm7; -} - -void TF512(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP0; - static __m128i TEMP1; - static __m128i TEMP2; - static __m128i TEMP3; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm12 - xmm15 */ - xmm12 = message[0]; - xmm13 = message[1]; - xmm14 = message[2]; - xmm15 = message[3]; - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* load previous chaining value and xor message to CV to get input of P */ - /* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - xmm8 = _mm_xor_si128(xmm12, chaining[0]); - xmm0 = _mm_xor_si128(xmm2, chaining[1]); - xmm4 = _mm_xor_si128(xmm6, chaining[2]); - xmm5 = _mm_xor_si128(xmm7, chaining[3]); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm10); - xmm2 = _mm_xor_si128(xmm2, xmm12); - xmm3 = _mm_xor_si128(xmm3, xmm14); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, chaining[0]); - xmm1 = _mm_xor_si128(xmm1, chaining[1]); - xmm2 = _mm_xor_si128(xmm2, chaining[2]); - xmm3 = _mm_xor_si128(xmm3, chaining[3]); - - /* store CV */ - chaining[0] = xmm0; - chaining[1] = xmm1; - chaining[2] = xmm2; - chaining[3] = xmm3; - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP0; - static __m128i TEMP1; - static __m128i TEMP2; - static __m128i TEMP3; - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - xmm8 = chaining[0]; - xmm10 = chaining[1]; - xmm12 = chaining[2]; - xmm14 = chaining[3]; - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm10 = _mm_xor_si128(xmm10, (chaining[1])); - xmm12 = _mm_xor_si128(xmm12, (chaining[2])); - xmm14 = _mm_xor_si128(xmm14, (chaining[3])); - - /* transform state back from row ordering into column ordering */ - /* result: final 
hash value in xmm9, xmm11 */ - Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); - - /* we only need to return the truncated half of the state */ - chaining[2] = xmm9; - chaining[3] = xmm11; -} - - diff --git a/algo/groestl/aes_ni/groestl256-intr-vperm.h b/algo/groestl/aes_ni/groestl256-intr-vperm.h deleted file mode 100644 index f6baa17..0000000 --- a/algo/groestl/aes_ni/groestl256-intr-vperm.h +++ /dev/null @@ -1,793 +0,0 @@ -/* groestl-intr-vperm.h Aug 2011 - * - * Groestl implementation with intrinsics using ssse3 instructions. - * Author: Günther A. Roland, Martin Schläffer - * - * Based on the vperm and aes_ni implementations of the hash function Groestl - * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ - * Institute of Applied Mathematics, Middle East Technical University, Turkey - * - * This code is placed in the public domain - */ - -#include -#include "hash-groestl256.h" - -/* global constants */ -__m128i ROUND_CONST_Lx; -__m128i ROUND_CONST_L0[ROUNDS512]; -__m128i ROUND_CONST_L7[ROUNDS512]; -__m128i ROUND_CONST_P[ROUNDS1024]; -__m128i ROUND_CONST_Q[ROUNDS1024]; -__m128i TRANSP_MASK; -__m128i SUBSH_MASK[8]; -__m128i ALL_0F; -__m128i ALL_15; -__m128i ALL_1B; -__m128i ALL_63; -__m128i ALL_FF; -__m128i VPERM_IPT[2]; -__m128i VPERM_OPT[2]; -__m128i VPERM_INV[2]; -__m128i VPERM_SB1[2]; -__m128i VPERM_SB2[2]; -__m128i VPERM_SB4[2]; -__m128i VPERM_SBO[2]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_SHARED_CONSTANTS(){\ - TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\ - ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\ - ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\ - VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\ - VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\ - VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\ - VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\ - VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\ - VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\ - VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\ - VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\ - VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\ - VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\ - VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\ - VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\ -}/**/ - -/* VPERM - * Transform w/o settings c* - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ - t0 = c0;\ - t1 = c0;\ - t0 = _mm_andnot_si128(t0, a0);\ - t1 = _mm_andnot_si128(t1, a1);\ - t0 = _mm_srli_epi32(t0, 4);\ - t1 = _mm_srli_epi32(t1, 4);\ - a0 = _mm_and_si128(a0, c0);\ - a1 = _mm_and_si128(a1, c0);\ - t2 = c2;\ - t3 = c2;\ - t2 = _mm_shuffle_epi8(t2, a0);\ - 
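An illustrative note, not part of the patch: VPERM_Transform_No_Const is the standard vperm nibble-split technique. Each byte is split into its low nibble (and with ALL_0F) and its high nibble (andnot plus a shift right by 4); each nibble then selects one of 16 table bytes held in an xmm register via pshufb, and the two lookups are XORed. A scalar model of a single byte, with hypothetical names (tab_hi corresponds to c1 = table[0], tab_lo to c2 = table[1]):

#include <stdint.h>

/* One vperm-style table application: a 256-entry byte mapping is
 * realized as two 16-entry lookups, one per nibble, XORed together.
 * The SIMD macro does this for 16 bytes at once with two pshufb. */
static uint8_t vperm_lookup( uint8_t x, const uint8_t tab_lo[16],
                             const uint8_t tab_hi[16] )
{
   return (uint8_t)( tab_lo[ x & 0x0f ] ^ tab_hi[ x >> 4 ] );
}

This covers byte maps that are XOR-separable over the nibbles, such as the linear IPT/OPT basis changes; the full AES S-box is not, which is why it needs the multi-step Substitute_Core/Lookup sequence below rather than a single pair of tables.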
t3 = _mm_shuffle_epi8(t3, a1);\ - a0 = c1;\ - a1 = c1;\ - a0 = _mm_shuffle_epi8(a0, t0);\ - a1 = _mm_shuffle_epi8(a1, t1);\ - a0 = _mm_xor_si128(a0, t2);\ - a1 = _mm_xor_si128(a1, t3);\ -}/**/ - -#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ - c0 = ALL_0F;\ - c1 = ((__m128i*) table )[0];\ - c2 = ((__m128i*) table )[1];\ -}/**/ - -/* VPERM - * Transform - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Transform State - * inputs: - * a0-a3 = state - * table = transformation table to use - * t* = clobbers - * outputs: - * a0-a3 = transformed state - * */ -#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ - VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Add Constant to State - * inputs: - * a0-a7 = state - * constant = constant to add - * t0 = clobber - * outputs: - * a0-a7 = state + constant - * */ -#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ - t0 = constant;\ - a0 = _mm_xor_si128(a0, t0);\ - a1 = _mm_xor_si128(a1, t0);\ - a2 = _mm_xor_si128(a2, t0);\ - a3 = _mm_xor_si128(a3, t0);\ - a4 = _mm_xor_si128(a4, t0);\ - a5 = _mm_xor_si128(a5, t0);\ - a6 = _mm_xor_si128(a6, t0);\ - a7 = _mm_xor_si128(a7, t0);\ -}/**/ - -/* VPERM - * Set Substitute Core Constants - * */ -#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ - VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ -}/**/ - -/* VPERM - * Substitute Core - * first part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0 = 1 row - * t*, c* = clobbers - * outputs: - * b0a, b0b = inputs for lookup step - * */ -#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ - t0 = c0;\ - t0 = _mm_andnot_si128(t0, a0);\ - t0 = _mm_srli_epi32(t0, 4);\ - a0 = _mm_and_si128(a0, c0);\ - b0a = c1;\ - b0a = _mm_shuffle_epi8(b0a, a0);\ - a0 = _mm_xor_si128(a0, t0);\ - b0b = c2;\ - b0b = _mm_shuffle_epi8(b0b, t0);\ - b0b = _mm_xor_si128(b0b, b0a);\ - t1 = c2;\ - t1 = _mm_shuffle_epi8(t1, a0);\ - t1 = _mm_xor_si128(t1, b0a);\ - b0a = c2;\ - b0a = _mm_shuffle_epi8(b0a, b0b);\ - b0a = _mm_xor_si128(b0a, a0);\ - b0b = c2;\ - b0b = _mm_shuffle_epi8(b0b, t1);\ - b0b = _mm_xor_si128(b0b, t0);\ -}/**/ - -/* VPERM - * Lookup - * second part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0a, a0b = output of Substitution Core - * table = lookup table to use (*1 / *2 / *4) - * t0 = clobber - * outputs: - * b0 = output of sbox + multiplication - * */ -#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ - b0 = ((__m128i*) table )[0];\ - t0 = ((__m128i*) table )[1];\ - b0 = _mm_shuffle_epi8(b0, a0b);\ - t0 = _mm_shuffle_epi8(t0, a0a);\ - b0 = _mm_xor_si128(b0, t0);\ -}/**/ - -/* VPERM - * SubBytes and *2 / *4 - * this function is derived from: - * Constant-time 
SSSE3 AES core implementation - * by Mike Hamburg - * and - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0-a7 = state - * t*, c* = clobbers - * outputs: - * a0-a7 = state * 4 - * c2 = row0 * 2 -> b0 - * c1 = row7 * 2 -> b3 - * c0 = row7 * 1 -> b4 - * t2 = row4 * 1 -> b7 - * TEMP_MUL1 = row(i) * 1 - * TEMP_MUL2 = row(i) * 2 - * - * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ -#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ - /* set Constants */\ - VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ - /* row 1 */\ - VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[1] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[1] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ - /* --- */\ - /* row 2 */\ - VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[2] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[2] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ - /* --- */\ - /* row 3 */\ - VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[3] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[3] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ - /* --- */\ - /* row 5 */\ - VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[5] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[5] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ - /* --- */\ - /* row 6 */\ - VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[6] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[6] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ - /* --- */\ - /* row 7 */\ - VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - TEMP_MUL1[7] = t2;\ - VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ - VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ - /* --- */\ - /* row 4 */\ - VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - TEMP_MUL2[4] = t3;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ - /* --- */\ - /* row 0 */\ - VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ - TEMP_MUL2[0] = c2;\ - VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ - /* --- */\ -}/**/ - - -/* Optimized MixBytes - * inputs: - * a0-a7 = (row0-row7) * 4 - * b0 = row0 * 2 - * b3 = row7 * 2 - * b4 = row7 * 1 - * b7 = row4 * 1 - * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 - * output: b0-b7 - * */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* save one value */\ - TEMP_MUL4 = a3;\ - /* 1 */\ - b1 = a0;\ - b1 = _mm_xor_si128(b1, a5);\ - b1 = _mm_xor_si128(b1, b4); /* -> helper! */\ - b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\ - b2 = b1;\ - \ - /* 2 */\ - b5 = a1;\ - b5 = _mm_xor_si128(b5, a4);\ - b5 = _mm_xor_si128(b5, b7); /* -> helper! */\ - b5 = _mm_xor_si128(b5, b3); /* -> helper! */\ - b6 = b5;\ - \ - /* 4 */\ - b7 = _mm_xor_si128(b7, a6);\ - /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! 
*/\ - b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\ - b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\ - b7 = _mm_xor_si128(b7, b3); /* -> helper! */\ - b2 = _mm_xor_si128(b2, b7);\ - \ - /* 3 */\ - b0 = _mm_xor_si128(b0, a7);\ - b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\ - b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\ - /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\ - b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\ - b3 = b0;\ - b1 = _mm_xor_si128(b1, b0);\ - b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\ - \ - /* 5 */\ - b4 = _mm_xor_si128(b4, a2);\ - /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\ - b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\ - b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\ - b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\ - b3 = _mm_xor_si128(b3, b4);\ - b6 = _mm_xor_si128(b6, b4);\ - \ - /* 6 */\ - a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\ - b4 = _mm_xor_si128(b4, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - b7 = _mm_xor_si128(b7, a3);\ - \ - /* 7 */\ - a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\ - a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\ - b2 = _mm_xor_si128(b2, a1);\ - b3 = _mm_xor_si128(b3, a1);\ - \ - /* 8 */\ - a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\ - a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\ - b6 = _mm_xor_si128(b6, a5);\ - b7 = _mm_xor_si128(b7, a5);\ - \ - /* 9 */\ - a3 = TEMP_MUL1[2];\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\ - b0 = _mm_xor_si128(b0, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - \ - /* 10 */\ - a1 = TEMP_MUL1[6];\ - a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\ - b1 = _mm_xor_si128(b1, a1);\ - b4 = _mm_xor_si128(b4, a1);\ - \ - /* 11 */\ - a5 = TEMP_MUL1[3];\ - a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\ - b1 = _mm_xor_si128(b1, a5);\ - b6 = _mm_xor_si128(b6, a5);\ - \ - /* 12 */\ - a3 = TEMP_MUL1[7];\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\ - b2 = _mm_xor_si128(b2, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - \ - /* 13 */\ - b0 = _mm_xor_si128(b0, (TEMP_MUL4));\ - b0 = _mm_xor_si128(b0, a4);\ - b1 = _mm_xor_si128(b1, a4);\ - b3 = _mm_xor_si128(b3, a6);\ - b4 = _mm_xor_si128(b4, a0);\ - b4 = _mm_xor_si128(b4, a7);\ - b5 = _mm_xor_si128(b5, a0);\ - b7 = _mm_xor_si128(b7, a2);\ -}/**/ - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\ - SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\ - SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\ - SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\ - SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\ - SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\ - SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\ - SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\ - for(i = 0; i < ROUNDS512; i++)\ - {\ - ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ - }\ - ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ -}/**/ - -/* vperm: - * transformation before rounds with ipt - * first round add transformed constant - * middle rounds: add constant XOR 0x15...15 - * last round: additionally add 0x15...15 after MB - * 
transformation after rounds with opt - */ -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant + ShiftBytes (interleaved) */\ - b1 = ROUND_CONST_Lx;\ - a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ - a1 = _mm_xor_si128(a1, b1);\ - a2 = _mm_xor_si128(a2, b1);\ - a3 = _mm_xor_si128(a3, b1);\ - a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ - a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ - a4 = _mm_xor_si128(a4, b1);\ - a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ - a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ - a5 = _mm_xor_si128(a5, b1);\ - a6 = _mm_xor_si128(a6, b1);\ - a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ - a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ - a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ - a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ - a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ - /* SubBytes + Multiplication by 2 and 4 */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -/* 10 rounds, P and Q in parallel */ -#define ROUNDS_P_Q(){\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ - ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\ -} - - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - t0 = TRANSP_MASK;\ -\ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ -\ - o1 = i0;\ - t0 = i2;\ -\ - i0 = _mm_unpacklo_epi16(i0, i1);\ - o1 = _mm_unpackhi_epi16(o1, i1);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - t0 = _mm_unpackhi_epi16(t0, i3);\ -\ - i0 = _mm_shuffle_epi32(i0, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - t0 = _mm_shuffle_epi32(t0, 216);\ -\ - o2 = i0;\ - o3 = o1;\ -\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - o1 = _mm_unpacklo_epi32(o1, t0);\ - o2 = _mm_unpackhi_epi32(o2, i2);\ - o3 = _mm_unpackhi_epi32(o3, t0);\ -}/**/ - -/* Matrix Transpose Step 2 - * input 
are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - o1 = i0;\ - o2 = i1;\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - o1 = _mm_unpackhi_epi64(o1, i4);\ - o3 = i1;\ - o4 = i2;\ - o2 = _mm_unpacklo_epi64(o2, i5);\ - o3 = _mm_unpackhi_epi64(o3, i5);\ - o5 = i2;\ - o6 = i3;\ - o4 = _mm_unpacklo_epi64(o4, i6);\ - o5 = _mm_unpackhi_epi64(o5, i6);\ - o7 = i3;\ - o6 = _mm_unpacklo_epi64(o6, i7);\ - o7 = _mm_unpackhi_epi64(o7, i7);\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - o0 = i0;\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o0 = _mm_unpackhi_epi64(o0, i1);\ - o1 = i2;\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - o1 = _mm_unpackhi_epi64(o1, i3);\ - o2 = i4;\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - o2 = _mm_unpackhi_epi64(o2, i5);\ - o3 = i6;\ - i6 = _mm_unpacklo_epi64(i6, i7);\ - o3 = _mm_unpackhi_epi64(o3, i7);\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - t0 = _mm_xor_si128(t0, t0);\ - i1 = i0;\ - i3 = i2;\ - i5 = i4;\ - i7 = i6;\ - i0 = _mm_unpacklo_epi64(i0, t0);\ - i1 = _mm_unpackhi_epi64(i1, t0);\ - i2 = _mm_unpacklo_epi64(i2, t0);\ - i3 = _mm_unpackhi_epi64(i3, t0);\ - i4 = _mm_unpacklo_epi64(i4, t0);\ - i5 = _mm_unpackhi_epi64(i5, t0);\ - i6 = _mm_unpacklo_epi64(i6, t0);\ - i7 = _mm_unpackhi_epi64(i7, t0);\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - i6 = _mm_unpacklo_epi64(i6, i7);\ -}/**/ - - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - xmm0 = ROUND_CONST_L0[i];\ - xmm1 = ROUND_CONST_L7[i];\ - xmm2 = ROUND_CONST_L0[j];\ - xmm3 = ROUND_CONST_L7[j];\ - VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ - xmm1 = _mm_xor_si128(xmm1, (ALL_15));\ - xmm2 = _mm_xor_si128(xmm2, (ALL_15));\ - xmm3 = _mm_xor_si128(xmm3, (ALL_15));\ - ROUND_CONST_L0[i] = xmm0;\ - ROUND_CONST_L7[i] = xmm1;\ - ROUND_CONST_L0[j] = xmm2;\ - ROUND_CONST_L7[j] = xmm3;\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - xmm0 = ROUND_CONST_Lx;\ - VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (ALL_15));\ - ROUND_CONST_Lx = xmm0;\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - 
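A side note, not part of the patch: the values being transformed here are the ordinary Groestl-256 AddRoundConstant words, the same ones the retained AES-NI version bakes into the static round_const_l0/round_const_l7 arrays earlier in this patch. For round i, row 0 gets 0x7060504030201000 ^ (i * 0x0101010101010101) and row 7 gets 0x8f9fafbfcfdfefff ^ (i * 0x0101010101010101). A throwaway generator along these lines (hypothetical, useful only for checking the tables):

#include <stdint.h>
#include <stdio.h>

/* Print the (row 0, row 7) round-constant qwords for the 10 rounds of
 * Groestl-256; output should match round_const_l0 / round_const_l7. */
int main( void )
{
   const uint64_t base_l0 = 0x7060504030201000ULL;
   const uint64_t base_l7 = 0x8f9fafbfcfdfefffULL;
   for ( int i = 0; i < 10; i++ )
   {
      uint64_t d = (uint64_t)i * 0x0101010101010101ULL;
      /* l0[i]: low qword is the constant, high qword is all ones */
      printf( "l0[%d] = %016llx ffffffffffffffff\n", i,
              (unsigned long long)( base_l0 ^ d ) );
      /* l7[i]: low qword is zero, high qword is the constant */
      printf( "l7[%d] = 0000000000000000 %016llx\n", i,
              (unsigned long long)( base_l7 ^ d ) );
   }
   return 0;
}

The vperm build cannot use these constants as-is; this macro pushes them through the VPERM_IPT basis change once at init and XORs in ALL_15, matching the "middle rounds: add constant XOR 0x15...15" scheme described in the header comment above.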
VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ -}/**/ - -void INIT256(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15; - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm12 - xmm15 */ - xmm12 = chaining[0]; - xmm13 = chaining[1]; - xmm14 = chaining[2]; - xmm15 = chaining[3]; - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* store transposed IV */ - chaining[0] = xmm12; - chaining[1] = xmm2; - chaining[2] = xmm6; - chaining[3] = xmm7; -} - -void TF512(u64* h, u64* m) -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - -#ifdef IACA_TRACE - IACA_START; -#endif - - /* load message into registers xmm12 - xmm15 */ - xmm12 = message[0]; - xmm13 = message[1]; - xmm14 = message[2]; - xmm15 = message[3]; - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - xmm8 = chaining[0]; - xmm0 = chaining[1]; - xmm4 = chaining[2]; - xmm5 = chaining[3]; - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - xmm8 = _mm_xor_si128(xmm8, xmm12); - xmm0 = _mm_xor_si128(xmm0, xmm2); - xmm4 = _mm_xor_si128(xmm4, xmm6); - xmm5 = _mm_xor_si128(xmm5, xmm7); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* compute the two permutations P and Q in parallel */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, xmm8); - xmm1 = _mm_xor_si128(xmm1, xmm10); - xmm2 = _mm_xor_si128(xmm2, xmm12); - xmm3 = _mm_xor_si128(xmm3, xmm14); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - xmm0 = _mm_xor_si128(xmm0, (chaining[0])); - xmm1 = _mm_xor_si128(xmm1, (chaining[1])); - xmm2 = _mm_xor_si128(xmm2, (chaining[2])); - xmm3 = _mm_xor_si128(xmm3, (chaining[3])); - - /* store CV */ - chaining[0] = xmm0; - chaining[1] = xmm1; - chaining[2] = xmm2; - chaining[3] = xmm3; - -#ifdef 
IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF512(u64* h) -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - xmm8 = chaining[0]; - xmm10 = chaining[1]; - xmm12 = chaining[2]; - xmm14 = chaining[3]; - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - ROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm10 = _mm_xor_si128(xmm10, (chaining[1])); - xmm12 = _mm_xor_si128(xmm12, (chaining[2])); - xmm14 = _mm_xor_si128(xmm14, (chaining[3])); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); - VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); - - /* we only need to return the truncated half of the state */ - chaining[2] = xmm9; - chaining[3] = xmm11; - - return; -}//OF512() - - - diff --git a/algo/groestl/aes_ni/hash-groestl.c b/algo/groestl/aes_ni/hash-groestl.c index cf680e4..d26ef27 100644 --- a/algo/groestl/aes_ni/hash-groestl.c +++ b/algo/groestl/aes_ni/hash-groestl.c @@ -16,48 +16,13 @@ #ifdef __AES__ -#include "groestl-version.h" - -#ifdef TASM - #ifdef VAES - #include "groestl-asm-aes.h" - #else - #ifdef VAVX - #include "groestl-asm-avx.h" - #else - #ifdef VVPERM - #include "groestl-asm-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif -#else - #ifdef TINTR - #ifdef VAES - #include "groestl-intr-aes.h" - #else - #ifdef VAVX - #include "groestl-intr-avx.h" - #else - #ifdef VVPERM - #include "groestl-intr-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif - #else - #error NO TYPE SPECIFIED (-DT[ASM/INTR]) - #endif -#endif +#include "groestl-intr-aes.h" HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen ) { int i; ctx->hashlen = hashlen; - SET_CONSTANTS(); if (ctx->chaining == NULL || ctx->buffer == NULL) return FAIL_GR; @@ -70,8 +35,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen ) // The only non-zero in the IV is len. It can be hard coded. ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); -// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); -// INIT(ctx->chaining); ctx->buf_ptr = 0; ctx->rem_ptr = 0; @@ -92,8 +55,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx ) ctx->buffer[i] = _mm_setzero_si128(); } ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); -// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); -// INIT(ctx->chaining); ctx->buf_ptr = 0; ctx->rem_ptr = 0; @@ -109,7 +70,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx ) // 5. 
Midstate will work at reduced impact than full hash, if total hash // (midstate + tail) is less than 1 block. // This, unfortunately, is the case with all current users. -// 6. the morefull blocks the bigger the gain +// 6. the more full blocks the bigger the gain // use only for midstate precalc HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input, @@ -143,12 +104,11 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input, // deprecated do not use HashReturn_gr final_groestl( hashState_groestl* ctx, void* output ) { - const int len = (int)ctx->databitlen / 128; // bits to __m128i - const int blocks = ctx->blk_count + 1; // adjust for final block - - const int rem_ptr = ctx->rem_ptr; // end of data start of padding - const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i - const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer + const int len = (int)ctx->databitlen / 128; // bits to __m128i + const uint64_t blocks = ctx->blk_count + 1; // adjust for final block + const int rem_ptr = ctx->rem_ptr; // end of data start of padding + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer int i; // first pad byte = 0x80, last pad byte = block count @@ -157,21 +117,18 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output ) if ( rem_ptr == len - 1 ) { // only 128 bits left in buffer, all padding at once - ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 ); // add zero padding for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = _mm_setzero_si128(); // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, - 0, 0 ,0,0, 0,0,0,0 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 ); } // digest final padding block and do output transform @@ -189,21 +146,20 @@ int groestl512_full( hashState_groestl* ctx, void* output, const void* input, uint64_t databitlen ) { - int i; - - ctx->hashlen = 64; - SET_CONSTANTS(); - - for ( i = 0; i < SIZE512; i++ ) - { - ctx->chaining[i] = _mm_setzero_si128(); - ctx->buffer[i] = _mm_setzero_si128(); - } - ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); - ctx->buf_ptr = 0; - ctx->rem_ptr = 0; + int i; + ctx->hashlen = 64; + for ( i = 0; i < SIZE512; i++ ) + { + ctx->chaining[i] = _mm_setzero_si128(); + ctx->buffer[i] = _mm_setzero_si128(); + } + ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + // --- update --- + const int len = (int)databitlen / 128; const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; @@ -211,8 +167,6 @@ int groestl512_full( hashState_groestl* ctx, void* output, uint64_t blocks = len / SIZE512; __m128i* in = (__m128i*)input; - // --- update --- - // digest any full blocks, process directly from input for ( i = 0; i < blocks; i++ ) TF1024( ctx->chaining, &in[ i * SIZE512 ] ); @@ -231,26 +185,22 @@ int groestl512_full( hashState_groestl* ctx, void* output, if ( i == len -1 ) { // only 128 bits left in buffer, all padding at once - ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[i] = 
_mm_set_epi64x( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[i] = m128_const_64( 0, 0x80 ); // add zero padding for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = _mm_setzero_si128(); // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, - 0, 0 ,0,0, 0,0,0,0 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 ); } // digest final padding block and do output transform TF1024( ctx->chaining, ctx->buffer ); - OF1024( ctx->chaining ); // store hash result in output @@ -268,7 +218,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output, const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i const int hash_offset = SIZE512 - hashlen_m128i; int rem = ctx->rem_ptr; - int blocks = len / SIZE512; + uint64_t blocks = len / SIZE512; __m128i* in = (__m128i*)input; int i; @@ -292,26 +242,22 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output, if ( i == len -1 ) { // only 128 bits left in buffer, all padding at once - ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 ); } else { // add first padding - ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, - 0,0,0,0, 0,0,0,0x80 ); + ctx->buffer[i] = m128_const_64( 0, 0x80 ); // add zero padding for ( i += 1; i < SIZE512 - 1; i++ ) ctx->buffer[i] = _mm_setzero_si128(); // add length padding, second last byte is zero unless blocks > 255 - ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0, - 0, 0 ,0,0, 0,0,0,0 ); + ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 ); } // digest final padding block and do output transform TF1024( ctx->chaining, ctx->buffer ); - OF1024( ctx->chaining ); // store hash result in output diff --git a/algo/groestl/aes_ni/hash-groestl256.c b/algo/groestl/aes_ni/hash-groestl256.c index ac6e5f5..34a37b1 100644 --- a/algo/groestl/aes_ni/hash-groestl256.c +++ b/algo/groestl/aes_ni/hash-groestl256.c @@ -13,41 +13,7 @@ #ifdef __AES__ -#include "groestl-version.h" - -#ifdef TASM - #ifdef VAES - #include "groestl256-asm-aes.h" - #else - #ifdef VAVX - #include "groestl256-asm-avx.h" - #else - #ifdef VVPERM - #include "groestl256-asm-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif -#else - #ifdef TINTR - #ifdef VAES - #include "groestl256-intr-aes.h" - #else - #ifdef VAVX - #include "groestl256-intr-avx.h" - #else - #ifdef VVPERM - #include "groestl256-intr-vperm.h" - #else - #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) - #endif - #endif - #endif - #else - #error NO TYPE SPECIFIED (-DT[ASM/INTR]) - #endif -#endif +#include "groestl256-intr-aes.h" /* initialise context */ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen ) @@ -55,7 +21,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen ) int i; ctx->hashlen = hashlen; - SET_CONSTANTS(); if (ctx->chaining == NULL || ctx->buffer == NULL) return FAIL_GR; diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index 0193210..3f4b671 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -53,7 +53,7 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, n += 8; } while ( (n < max_nonce-8) && !work_restart[thr_id].restart); - + pdata[19] = n; *hashes_done = n - first_nonce + 1; return 0; } @@ -104,7 +104,7 @@ int 
scanhash_keccak_4way( struct work *work, uint32_t max_nonce, m256_const1_64( 0x0000000400000000 ) ); n += 4; } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); - + pdata[19] = n; *hashes_done = n - first_nonce + 1; return 0; } diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index 568a5da..282ae91 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -74,7 +74,7 @@ void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) bool register_sha3d_algo( algo_gate_t* gate ) { hard_coded_eb = 6; - opt_extranonce = false; +// opt_extranonce = false; gate->optimizations = AVX2_OPT | AVX512_OPT; gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root; #if defined (KECCAK_8WAY) diff --git a/algo/keccak/sha3d-4way.c b/algo/keccak/sha3d-4way.c index dfd4320..41259e8 100644 --- a/algo/keccak/sha3d-4way.c +++ b/algo/keccak/sha3d-4way.c @@ -46,7 +46,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, sha3d_hash_8way( hash, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) ) { extr_lane_8x64( lane_hash, hash, lane, 256 ); if ( valid_hash( lane_hash, ptarget ) ) @@ -59,8 +59,8 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, m512_const1_64( 0x0000000800000000 ) ); n += 8; - } while ( (n < last_nonce) && !work_restart[thr_id].restart); - + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -105,7 +105,7 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, sha3d_hash_4way( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) + if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); if ( valid_hash( lane_hash, ptarget ) ) @@ -117,8 +117,8 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, *noncev = _mm256_add_epi32( *noncev, m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( (n < last_nonce) && !work_restart[thr_id].restart); - + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index 09fbe13..780e56d 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -344,17 +344,12 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, // 16 byte partial block exists for 80 byte len if ( state->rembytes ) - { - // padding of partial block - rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), + // padding of partial block + rnd512( state, m128_const_64( 0, 0x80000000 ), mm128_bswap_32( cast_m128i( data ) ) ); - } else - { - // empty pad block - rnd512( state, _mm_setzero_si128(), - _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); - } + // empty pad block + rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) ); finalization512( state, (uint32*) output ); if ( state->hashbitlen > 512 ) @@ -363,6 +358,56 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, return SUCCESS; } + +int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, + const BitSequence* data, size_t inlen ) +{ +// Optimized for multiples of 16 bytes, good for 64 and 80 byte len + int i; + state->hashbitlen = hashbitlen; + /* set the lower 32 bits to '1' 
*/ + MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); + /* set all bits to '1' */ + ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); + /* set the 32-bit round constant values to the 128-bit data field */ + for ( i=0; i<32; i++ ) + CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] ); + for ( i=0; i<10; i++ ) + state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] ); + memset(state->buffer, 0, sizeof state->buffer ); + + // update + + int blocks = (int)( inlen / 32 ); + state->rembytes = inlen % 32; + + // full blocks + for ( i = 0; i < blocks; i++ ) + { + rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ), + mm128_bswap_32( casti_m128i( data, 0 ) ) ); + data += MSG_BLOCK_BYTE_LEN; + } + + // final + + // 16 byte partial block exists for 80 byte len + if ( state->rembytes ) + // padding of partial block + rnd512( state, m128_const_64( 0, 0x80000000 ), + mm128_bswap_32( cast_m128i( data ) ) ); + else + // empty pad block + rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) ); + + finalization512( state, (uint32*) output ); + if ( state->hashbitlen > 512 ) + finalization512( state, (uint32*)( output+128 ) ); + + return SUCCESS; +} + + /***************************************************/ /* Round function */ /* state: hash context */ diff --git a/algo/luffa/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h index d21b34c..5d0cb75 100644 --- a/algo/luffa/luffa_for_sse2.h +++ b/algo/luffa/luffa_for_sse2.h @@ -65,5 +65,6 @@ HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval ); HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, const BitSequence* data, size_t inlen ); - +int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, + const BitSequence* data, size_t inlen ); diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index e29419a..203ac89 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -280,14 +280,15 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, allium_16way_hash( hash, vdata ); for ( int lane = 0; lane < 16; lane++ ) - if unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) + if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) ) { pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, hash+(lane<<3), mythr, lane ); } *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) ); n += 16; - } while ( (n < last_nonce) && !work_restart[thr_id].restart); + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) ); + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -318,7 +319,6 @@ void allium_8way_hash( void *hash, const void *input ) { uint64_t vhashA[4*8] __attribute__ ((aligned (64))); uint64_t vhashB[4*8] __attribute__ ((aligned (64))); -// uint64_t hash[4*8] __attribute__ ((aligned (64))); uint64_t *hash0 = (uint64_t*)hash; uint64_t *hash1 = (uint64_t*)hash+ 4; uint64_t *hash2 = (uint64_t*)hash+ 8; @@ -443,7 +443,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, for ( int lane = 0; lane < 8; lane++ ) { const uint64_t *lane_hash = hash + (lane<<2); - if unlikely( valid_hash( lane_hash, ptarget ) && !bench ) + if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) ) { pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, lane_hash, mythr, lane ); @@ -451,7 +451,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, } n += 8; *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) ); - } while likely( (n <= last_nonce) 
&& !work_restart[thr_id].restart ); + } while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) ); pdata[19] = n; *hashes_done = n - first_nonce; return 0; diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index b6b90fe..16eb23c 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -194,7 +194,7 @@ bool register_allium_algo( algo_gate_t* gate ) ///////////////////////////////////////// -bool phi2_has_roots; +bool phi2_has_roots = false; bool phi2_use_roots = false; int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; } diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index 28811a6..89ae6da 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -189,7 +189,7 @@ bool init_allium_ctx(); // #define PHI2_4WAY #endif -bool phi2_has_roots; +extern bool phi2_has_roots; bool register_phi2_algo( algo_gate_t* gate ); #if defined(PHI2_4WAY) diff --git a/algo/simd/nist.c b/algo/simd/nist.c index 17b86a6..fbd4e71 100644 --- a/algo/simd/nist.c +++ b/algo/simd/nist.c @@ -360,18 +360,116 @@ HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval, return SUCCESS; } +int simd_full( hashState_sd *state, BitSequence *hashval, + const BitSequence *data, DataLength databitlen ) +{ + -/*HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, - BitSequence *hashval) { - hashState_sd s; - HashReturn r; - r = Init(&s, hashbitlen); - if (r != SUCCESS) - return r; - r = Update(&s, data, databitlen); - if (r != SUCCESS) - return r; - r = Final(&s, hashval); - return r; + InitIV( state, 512, IV_512 ); + + int current, i; + unsigned int bs = state->blocksize; + static int align = -1; + BitSequence out[64]; + int isshort = 1; + u64 l; + + if (align == -1) + align = RequiredAlignment(); + +#ifdef HAS_64 + current = state->count & (bs - 1); +#else + current = state->count_low & (bs - 1); +#endif + + if ( current & 7 ) + { + // The number of hashed bits is not a multiple of 8. + // Very painful to implement and not required by the NIST API. + return FAIL; + } + + while ( databitlen > 0 ) + { + if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. + SIMD_Compress(state, data, 0); + databitlen -= bs; + data += bs/8; + IncreaseCounter(state, bs); + } + else + { + // Copy a chunk of data to the buffer + unsigned int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer+current/8, data, (databitlen+7)/8 ); + IncreaseCounter( state, databitlen ); + break; + } + else + { + memcpy( state->buffer+current/8, data, len/8 ); + IncreaseCounter( state,len ); + databitlen -= len; + data += len/8; + current = 0; + SIMD_Compress( state, state->buffer, 0 ); + } + } + } + + current = state->count & (state->blocksize - 1); + + // If there is still some data in the buffer, hash it + if ( current ) + { + // We first need to zero out the end of the buffer. 
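+ // Illustrative example (hypothetical values): with current == 12
+ // buffered bits, the mask below is 0xff >> (12 & 7) == 0x0f, so
+ // buffer[1] &= ~0x0f clears the unused low nibble while keeping the
+ // 4 valid high-order message bits of that byte.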
+ if ( current & 7 ) + { + BitSequence mask = 0xff >> ( current & 7 ); + state->buffer[current/8] &= ~mask; + } + current = ( current+7 ) / 8; + memset( state->buffer+current, 0, state->blocksize/8 - current ); + SIMD_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, state->blocksize / 8 ); + l = state->count; + for ( i=0; i<8; i++ ) + { + state->buffer[i] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_Compress( state, state->buffer, isshort ); + + // Decode the 32-bit words into a BitSequence + for ( i=0; i < 2*state->n_feistels; i++ ) + { + u32 x = state->A[i]; + out[4*i ] = x & 0xff; + x >>= 8; + out[4*i+1] = x & 0xff; + x >>= 8; + out[4*i+2] = x & 0xff; + x >>= 8; + out[4*i+3] = x & 0xff; + } + + memcpy( hashval, out, state->hashbitlen / 8 ); + if ( state->hashbitlen % 8 ) + { + BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) ); + hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask; + } + return SUCCESS; } -*/ + diff --git a/algo/simd/nist.h b/algo/simd/nist.h index f4b017d..b4737ff 100644 --- a/algo/simd/nist.h +++ b/algo/simd/nist.h @@ -47,8 +47,8 @@ HashReturn final_sd(hashState_sd *state, BitSequence *hashval); HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval, const BitSequence *data, DataLength databitlen ); -//HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, -// BitSequence *hashval); +int simd_full( hashState_sd *state, BitSequence *hashval, + const BitSequence *data, DataLength databitlen ); /* * Internal API diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index 528f66f..142fb74 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -727,7 +727,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data, { memcpy_256( buf + (ptr>>3), vdata, len>>3 ); sc->ptr = ptr + len; - return; + if ( ptr < buf_size ) return; } READ_STATE_BIG( sc ); @@ -745,6 +745,8 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data, clen = buf_size - ptr; if ( clen > len ) clen = len; + len -= clen; + if ( len == 0 ) break; memcpy_256( buf + (ptr>>3), vdata, clen>>3 ); ptr += clen; vdata += (clen>>3); @@ -769,9 +771,12 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n, READ_STATE_BIG(sc); - memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); - et = 352 + ((bcount == 0) << 7); - UBI_BIG_4WAY( et, ptr ); + if ( ptr ) + { + memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 ); + et = 352 + ((bcount == 0) << 7); + UBI_BIG_4WAY( et, ptr ); + } memset_zero_256( buf, buf_size >> 3 ); bcount = 0; diff --git a/algo/x11/tribus-4way.c b/algo/x11/tribus-4way.c index 7e56268..0092763 100644 --- a/algo/x11/tribus-4way.c +++ b/algo/x11/tribus-4way.c @@ -17,8 +17,6 @@ static __thread jh512_8way_context ctx_mid; void tribus_hash_8way( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhashA[4*8] __attribute__ ((aligned (64))); - uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -44,6 +42,8 @@ void tribus_hash_8way( void *state, const void *input ) keccak512_8way_close( &ctx_keccak, vhash ); #if defined(__VAES__) + uint64_t vhashA[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*4] __attribute__ ((aligned (64))); 
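+ // Re-interleave the 8x64-bit keccak output into two 4x128-bit lane
+ // groups (vhashA, vhashB) so the 4-way VAES path below can work on
+ // 128-bit lanes directly.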
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); diff --git a/algo/x16/hex.c b/algo/x16/hex.c index 631e428..bd2df93 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -76,10 +76,13 @@ union _hex_context_overlay }; typedef union _hex_context_overlay hex_context_overlay; +static __thread hex_context_overlay hex_ctx; + void hex_hash( void* output, const void* input ) { uint32_t _ALIGN(128) hash[16]; hex_context_overlay ctx; + memcpy( &ctx, &hex_ctx, sizeof(ctx) ); void *in = (void*) input; int size = 80; /* @@ -109,23 +112,21 @@ void hex_hash( void* output, const void* input ) break; case GROESTL: #if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)in, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 ); #else sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, in, size ); sph_groestl512_close(&ctx.groestl, hash); #endif break; - case SKEIN: - sph_skein512_init( &ctx.skein ); - sph_skein512( &ctx.skein, in, size ); - sph_skein512_close( &ctx.skein, hash ); - break; case JH: - sph_jh512_init( &ctx.jh ); - sph_jh512(&ctx.jh, in, size ); + if ( i == 0 ) + sph_jh512(&ctx.jh, in+64, 16 ); + else + { + sph_jh512_init( &ctx.jh ); + sph_jh512(&ctx.jh, in, size ); + } sph_jh512_close(&ctx.jh, hash ); break; case KECCAK: @@ -133,15 +134,37 @@ void hex_hash( void* output, const void* input ) sph_keccak512( &ctx.keccak, in, size ); sph_keccak512_close( &ctx.keccak, hash ); break; - case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)in, size ); + case SKEIN: + if ( i == 0 ) + sph_skein512(&ctx.skein, in+64, 16 ); + else + { + sph_skein512_init( &ctx.skein ); + sph_skein512( &ctx.skein, in, size ); + } + sph_skein512_close( &ctx.skein, hash ); break; + case LUFFA: + if ( i == 0 ) + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)in+64, 16 ); + else + { + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)in, size ); + } + break; case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)in, size ); + if ( i == 0 ) + cubehashUpdateDigest( &ctx.cube, (byte*)hash, + (const byte*)in+64, 16 ); + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash, + (const byte*)in, size ); + } break; case SHAVITE: sph_shavite512_init( &ctx.shavite ); @@ -155,9 +178,8 @@ void hex_hash( void* output, const void* input ) break; case ECHO: #if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence*)in, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash, 512, + (const BitSequence *)in, size ); #else sph_echo512_init( &ctx.echo ); sph_echo512( &ctx.echo, in, size ); @@ -165,9 +187,14 @@ void hex_hash( void* output, const void* input ) #endif break; case HAMSI: - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512( &ctx.hamsi, in, size ); - sph_hamsi512_close( &ctx.hamsi, hash ); + if ( i == 0 ) + sph_hamsi512( &ctx.hamsi, in+64, 16 ); + else + { + sph_hamsi512_init( &ctx.hamsi ); + sph_hamsi512( &ctx.hamsi, in, size ); + } + sph_hamsi512_close( &ctx.hamsi, hash ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -175,14 +202,24 @@ void hex_hash( void* output, const void* input ) sph_fugue512_close( &ctx.fugue, hash ); break; case SHABAL: - sph_shabal512_init( &ctx.shabal ); - 
sph_shabal512( &ctx.shabal, in, size ); - sph_shabal512_close( &ctx.shabal, hash ); + if ( i == 0 ) + sph_shabal512( &ctx.shabal, in+64, 16 ); + else + { + sph_shabal512_init( &ctx.shabal ); + sph_shabal512( &ctx.shabal, in, size ); + } + sph_shabal512_close( &ctx.shabal, hash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in, size ); - sph_whirlpool_close( &ctx.whirlpool, hash ); + if ( i == 0 ) + sph_whirlpool( &ctx.whirlpool, in+64, 16 ); + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in, size ); + } + sph_whirlpool_close( &ctx.whirlpool, hash ); break; case SHA_512: SHA512_Init( &ctx.sha512 ); @@ -201,47 +238,77 @@ int scanhash_hex( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t _ALIGN(128) hash32[8]; - uint32_t _ALIGN(128) endiandata[20]; + uint32_t _ALIGN(128) edata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; - int thr_id = mythr->id; // thr_id arg is deprecated + const uint32_t last_nonce = max_nonce - 4; + const int thr_id = mythr->id; uint32_t nonce = first_nonce; volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; + if ( bench ) ptarget[7] = 0x0cff; - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); + casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); + casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); + casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); + casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); uint32_t ntime = swab32(pdata[17]); if ( s_ntime != ntime ) { - hex_getAlgoString( (const uint32_t*) (&endiandata[1]), hashOrder ); + hex_getAlgoString( (const uint32_t*) (&edata[1]), hashOrder ); s_ntime = ntime; if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); + applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + sph_jh512_init( &hex_ctx.jh ); + sph_jh512( &hex_ctx.jh, edata, 64 ); + break; + case SKEIN: + sph_skein512_init( &hex_ctx.skein ); + sph_skein512( &hex_ctx.skein, edata, 64 ); + break; + case CUBEHASH: + cubehashInit( &hex_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &hex_ctx.cube, (const byte*)edata, 64 ); + break; + case HAMSI: + sph_hamsi512_init( &hex_ctx.hamsi ); + sph_hamsi512( &hex_ctx.hamsi, edata, 64 ); + break; + case SHABAL: + sph_shabal512_init( &hex_ctx.shabal ); + sph_shabal512( &hex_ctx.shabal, edata, 64 ); + break; + case WHIRLPOOL: + sph_whirlpool_init( &hex_ctx.whirlpool ); + sph_whirlpool( &hex_ctx.whirlpool, edata, 64 ); + break; + } + do { - be32enc( &endiandata[19], nonce ); - hex_hash( hash32, endiandata ); + edata[19] = nonce; + hex_hash( hash32, edata ); - if ( hash32[7] <= Htarg ) - if (fulltest( hash32, ptarget ) && !opt_benchmark ) + if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) ) { - pdata[19] = nonce; + be32enc( &pdata[19], nonce ); submit_solution( work, hash32, mythr ); } nonce++; - } while ( nonce < max_nonce && !(*restart) ); + } while ( nonce < last_nonce && !(*restart) ); pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } + diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 73f15fd..f31820b 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -17,6 +17,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cube-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/simd-hash-2way.h" @@ -32,11 +33,11 @@ #include "algo/echo/echo-hash-4way.h" #endif +#if defined (X16R_8WAY) + static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; -#if defined (X16R_8WAY) - union _x16r_8way_context_overlay { blake512_8way_context blake; @@ -45,7 +46,8 @@ union _x16r_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; + cubehashParam cube; +// cube_4way_context cube; simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; @@ -65,19 +67,21 @@ union _x16r_8way_context_overlay typedef union _x16r_8way_context_overlay x16r_8way_context_overlay; +static __thread x16r_8way_context_overlay x16r_ctx; void x16r_8way_hash( void* output, const void* input ) { - uint32_t vhash[24*8] __attribute__ ((aligned (128))); - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t hash4[24] __attribute__ ((aligned (64))); - uint32_t hash5[24] __attribute__ ((aligned (64))); - uint32_t hash6[24] __attribute__ ((aligned (64))); - uint32_t hash7[24] __attribute__ ((aligned (64))); + uint32_t vhash[20*8] __attribute__ ((aligned (128))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t hash4[20] __attribute__ ((aligned (64))); + uint32_t hash5[20] __attribute__ ((aligned (64))); + uint32_t hash6[20] __attribute__ ((aligned (64))); + uint32_t hash7[20] __attribute__ ((aligned (64))); 
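+ // Lane buffers are 20 32-bit words (80 bytes) each: enough for the
+ // input block on the first pass and for the 64 byte intermediate
+ // hashes on every later pass.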
x16r_8way_context_overlay ctx; + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -143,28 +147,14 @@ void x16r_8way_hash( void* output, const void* input ) groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 ); #endif break; - case SKEIN: - skein512_8way_init( &ctx.skein ); - if ( i == 0 ) - skein512_8way_update( &ctx.skein, input, size ); - else - { - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - skein512_8way_update( &ctx.skein, vhash, size ); - } - skein512_8way_close( &ctx.skein, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); - break; case JH: - jh512_8way_init( &ctx.jh ); if ( i == 0 ) - jh512_8way_update( &ctx.jh, input, size ); + jh512_8way_update( &ctx.jh, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); + jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, size ); } jh512_8way_close( &ctx.jh, vhash ); @@ -185,21 +175,97 @@ void x16r_8way_hash( void* output, const void* input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; + case SKEIN: + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; case LUFFA: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } + else + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } break; case CUBEHASH: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - cube_4way_full( &ctx.cube, vhash, 512, vhash, size ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + 
(const byte*)in3 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash4, + (const byte*)in4 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash5, + (const byte*)in5 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash6, + (const byte*)in6 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash7, + (const byte*)in7 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash4, + (const byte*)in4, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash5, + (const byte*)in5, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash6, + (const byte*)in6, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash7, + (const byte*)in7, size ); + } break; case SHAVITE: #if defined(__VAES__) @@ -272,13 +338,17 @@ void x16r_8way_hash( void* output, const void* input ) #endif break; case HAMSI: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + if ( i == 0 ) + hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, size ); - hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case FUGUE: @@ -309,38 +379,72 @@ void x16r_8way_hash( void* output, const void* input ) break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, size ); + size<<3 ); + if ( i == 0 ) + shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 ); + else + { + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + } shabal512_8way_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in4, size ); - 
sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in5, size ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in6, size ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in7, size ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } break; case SHA_512: sha512_8way_init( &ctx.sha512 ); @@ -355,7 +459,7 @@ void x16r_8way_hash( void* output, const void* input ) sha512_8way_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - break; + break; } size = 64; } @@ -373,23 +477,22 @@ void x16r_8way_hash( void* output, const void* input ) int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[8*16] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t vdata2[20*8] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const 
uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; + const int thr_id = mythr->id; volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm512_bswap32_intrlv80_8x64( vdata, pdata ); + if ( bench ) ptarget[7] = 0x0cff; bedata1[0] = bswap_32( pdata[1] ); bedata1[1] = bswap_32( pdata[2] ); @@ -402,32 +505,84 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + jh512_8way_init( &x16r_ctx.jh ); + jh512_8way_update( &x16r_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_init( &x16r_ctx.skein ); + skein512_8way_update( &x16r_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + intrlv_4x128( vdata2, edata, edata, edata, edata, 640 ); + luffa_4way_init( &x16r_ctx.luffa, 512 ); + luffa_4way_update( &x16r_ctx.luffa, vdata2, 64 ); + rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + hamsi512_8way_init( &x16r_ctx.hamsi ); + hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm256_bswap32_intrlv80_8x32( vdata2, pdata ); + shabal512_8way_init( &x16r_ctx.shabal ); + shabal512_8way_update( &x16r_ctx.shabal, vdata2, 64 ); + rintrlv_8x32_8x64( vdata, vdata2, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16r_ctx.whirlpool ); + sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + default: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + } + + *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( + n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16r_8way_hash( hash, vdata ); - pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n += 8; } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } - #elif defined (X16R_4WAY) +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; + union _x16r_4way_context_overlay { blake512_4way_context blake; @@ -438,6 +593,7 @@ union _x16r_4way_context_overlay 
jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; + hashState_luffa luffa1; cubehashParam cube; sph_shavite512_context shavite; simd_2way_context simd; @@ -449,14 +605,17 @@ union _x16r_4way_context_overlay } __attribute__ ((aligned (64))); typedef union _x16r_4way_context_overlay x16r_4way_context_overlay; +static __thread x16r_4way_context_overlay x16r_ctx; + void x16r_4way_hash( void* output, const void* input ) { - uint32_t vhash[24*4] __attribute__ ((aligned (128))); - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); + uint32_t vhash[20*4] __attribute__ ((aligned (128))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); x16r_4way_context_overlay ctx; + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -500,25 +659,13 @@ void x16r_4way_hash( void* output, const void* input ) groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way_update( &ctx.skein, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way_update( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); - break; case JH: - jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way_update( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); @@ -536,27 +683,68 @@ void x16r_4way_hash( void* output, const void* input ) keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; + case SKEIN: + if ( i == 0 ) + skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + skein512_4way_init( &ctx.skein ); + skein512_4way_update( &ctx.skein, vhash, size ); + } + skein512_4way_close( &ctx.skein, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + break; case LUFFA: - intrlv_2x128( vhash, in0, in1, size<<3 ); - luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128_512( hash0, hash1, vhash ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128_512( hash2, hash3, vhash ); + if ( i == 0 ) + { + intrlv_2x128( vhash, in0, in1, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash + (16<<1), 16 ); + dintrlv_2x128_512( hash2, hash3, vhash ); + } + else + { + intrlv_2x128( vhash, in0, in1, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); + } break; case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 
32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + } break; case SHAVITE: sph_shavite512_init( &ctx.shavite ); @@ -591,11 +779,16 @@ void x16r_4way_hash( void* output, const void* input ) (const BitSequence *)in3, size ); break; case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + if ( i == 0 ) + hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_4way_close( &ctx.hamsi, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -613,31 +806,59 @@ void x16r_4way_hash( void* output, const void* input ) break; case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, size ); + if ( i == 0 ) + shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 ); + else + { + shabal512_4way_init( &ctx.shabal ); + shabal512_4way_update( &ctx.shabal, vhash, size ); + } shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, 
hash1 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16r_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } break; case SHA_512: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, size ); - sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + sha512_4way_init( &ctx.sha512 ); + if ( i == 0 ) + sha512_4way_update( &ctx.sha512, input, size ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + sha512_4way_init( &ctx.sha512 ); + sha512_4way_update( &ctx.sha512, vhash, size ); + } + sha512_4way_close( &ctx.sha512, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; } size = 64; @@ -651,23 +872,22 @@ void x16r_4way_hash( void* output, const void* input ) int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t hash[16*4] __attribute__ ((aligned (64))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t vdata2[20*4] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; volatile uint8_t *restart = &(work_restart[thr_id].restart); - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm256_bswap32_intrlv80_4x64( vdata, pdata ); + if ( bench ) ptarget[7] = 0x0cff; bedata1[0] = bswap_32( pdata[1] ); bedata1[1] = bswap_32( pdata[2] ); @@ -680,24 +900,72 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
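+ // hashOrder characters are hex digits: 'A'-'F' select functions 10-15, '0'-'9' select 0-9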
elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + jh512_4way_init( &x16r_ctx.jh ); + jh512_4way_update( &x16r_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_init( &x16r_ctx.skein ); + skein512_4way_update( &x16r_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + intrlv_2x128( vdata2, edata, edata, 640 ); + luffa_2way_init( &x16r_ctx.luffa, 512 ); + luffa_2way_update( &x16r_ctx.luffa, vdata2, 64 ); + rintrlv_2x128_4x64( vdata, vdata2, vdata2, 512 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16r_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + hamsi512_4way_init( &x16r_ctx.hamsi ); + hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm128_bswap32_intrlv80_4x32( vdata2, pdata ); + shabal512_4way_init( &x16r_ctx.shabal ); + shabal512_4way_update( &x16r_ctx.shabal, vdata2, 64 ); + rintrlv_4x32_4x64( vdata, vdata2, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16r_ctx.whirlpool ); + sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + default: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + } + + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16r_4way_hash( hash, vdata ); - pdata[19] = n; - for ( int i = 0; i < 4; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 6323589..c438c1e 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -1,5 +1,7 @@ #include "x16r-gate.h" +void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL; + void x16r_getAlgoString( const uint8_t* prevblock, char *output ) { char *sptr = output; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 0457bd5..f86d069 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -50,7 +50,7 @@ enum x16r_Algo { X16R_HASH_FUNC_COUNT }; -void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ); +extern void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ); void x16r_getAlgoString( const uint8_t *prevblock, char *output ); void x16s_getAlgoString( const uint8_t *prevblock, char *output ); void x16rt_getAlgoString( const uint32_t *timeHash, char *output ); diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c index e50dc01..d6da77c 100644 --- a/algo/x16/x16rt-4way.c +++ b/algo/x16/x16rt-4way.c @@ -11,6 +11,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include 
"algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" @@ -26,11 +27,11 @@ #include "algo/echo/echo-hash-4way.h" #endif +#if defined (X16RT_8WAY) + static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; -#if defined (X16RT_8WAY) - union _x16rt_8way_context_overlay { blake512_8way_context blake; @@ -39,7 +40,8 @@ union _x16rt_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; + cubehashParam cube; +// cube_4way_context cube; simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; @@ -59,18 +61,21 @@ union _x16rt_8way_context_overlay typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay; +static __thread x16rt_8way_context_overlay x16rt_ctx; + void x16rt_8way_hash( void* output, const void* input ) { - uint32_t vhash[24*8] __attribute__ ((aligned (128))); - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t hash4[24] __attribute__ ((aligned (64))); - uint32_t hash5[24] __attribute__ ((aligned (64))); - uint32_t hash6[24] __attribute__ ((aligned (64))); - uint32_t hash7[24] __attribute__ ((aligned (64))); + uint32_t vhash[20*8] __attribute__ ((aligned (128))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t hash4[20] __attribute__ ((aligned (64))); + uint32_t hash5[20] __attribute__ ((aligned (64))); + uint32_t hash6[20] __attribute__ ((aligned (64))); + uint32_t hash7[20] __attribute__ ((aligned (64))); x16rt_8way_context_overlay ctx; + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -92,18 +97,16 @@ void x16rt_8way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_8way_init( &ctx.blake ); if ( i == 0 ) - blake512_8way_update( &ctx.blake, input, size ); + blake512_8way_full( &ctx.blake, vhash, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - blake512_8way_update( &ctx.blake, vhash, size ); + blake512_8way_full( &ctx.blake, vhash, vhash, size ); } - blake512_8way_close( &ctx.blake, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7, vhash ); break; case BMW: bmw512_8way_init( &ctx.bmw ); @@ -130,54 +133,24 @@ void x16rt_8way_hash( void* output, const void* input ) groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - 
init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)in4, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)in5, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)in6, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 ); #endif break; - case SKEIN: - skein512_8way_init( &ctx.skein ); - if ( i == 0 ) - skein512_8way_update( &ctx.skein, input, size ); - else - { - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - skein512_8way_update( &ctx.skein, vhash, size ); - } - skein512_8way_close( &ctx.skein, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); - break; case JH: - jh512_8way_init( &ctx.jh ); if ( i == 0 ) - jh512_8way_update( &ctx.jh, input, size ); + jh512_8way_update( &ctx.jh, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); + jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, size ); } jh512_8way_close( &ctx.jh, vhash ); @@ -198,35 +171,105 @@ void x16rt_8way_hash( void* output, const void* input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; + case SKEIN: + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; case LUFFA: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } + else + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } break; case CUBEHASH: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - 
cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)in3 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash4, + (const byte*)in4 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash5, + (const byte*)in5 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash6, + (const byte*)in6 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash7, + (const byte*)in7 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash4, + (const byte*)in4, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash5, + (const byte*)in5, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash6, + (const byte*)in6, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash7, + (const byte*)in7, size ); + } break; case SHAVITE: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size ); + shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size ); + shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else sph_shavite512_init( &ctx.shavite ); @@ -257,61 +300,53 @@ void x16rt_8way_hash( void* output, const void* input ) break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, 
hash7, vhash ); break; case ECHO: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)in4, size ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)in5, size ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)in6, size ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)in7, size ); #endif - break; + break; case HAMSI: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + if ( i == 0 ) + hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, size ); - hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - break; + break; case FUGUE: sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, in0, size ); @@ -340,48 +375,87 @@ void x16rt_8way_hash( void* output, const void* input ) break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, size ); + size<<3 ); + if ( i == 0 ) + shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 ); + else + { + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + } shabal512_8way_close( &ctx.shabal, vhash 
); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in4, size ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in5, size ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in6, size ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in7, size ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } break; case SHA_512: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, size ); + if ( 
i == 0 ) + sha512_8way_update( &ctx.sha512, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + sha512_8way_update( &ctx.sha512, vhash, size ); + } sha512_8way_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); - break; + hash7, vhash ); + break; } size = 64; } @@ -399,23 +473,22 @@ void x16rt_8way_hash( void* output, const void* input ) int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[8*16] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t vdata2[20*8] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t _ALIGN(64) timeHash[8*8]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; + const int thr_id = mythr->id; volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm512_bswap32_intrlv80_8x64( vdata, pdata ); + if ( bench ) ptarget[7] = 0x0cff; uint32_t ntime = bswap_32( pdata[17] ); if ( s_ntime != ntime ) @@ -428,31 +501,84 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce, hashOrder, ntime, timeHash ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + jh512_8way_init( &x16rt_ctx.jh ); + jh512_8way_update( &x16rt_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_init( &x16rt_ctx.skein ); + skein512_8way_update( &x16rt_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + intrlv_4x128( vdata2, edata, edata, edata, edata, 640 ); + luffa_4way_init( &x16rt_ctx.luffa, 512 ); + luffa_4way_update( &x16rt_ctx.luffa, vdata2, 64 ); + rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16rt_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16rt_ctx.cube, (const byte*)edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + hamsi512_8way_init( &x16rt_ctx.hamsi ); + hamsi512_8way_update( &x16rt_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm256_bswap32_intrlv80_8x32( vdata2, pdata ); + shabal512_8way_init( &x16rt_ctx.shabal ); + shabal512_8way_update( &x16rt_ctx.shabal, vdata2, 64 ); + rintrlv_8x32_8x64( vdata, vdata2, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16rt_ctx.whirlpool ); + sph_whirlpool( &x16rt_ctx.whirlpool, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + default: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + } + + *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( + n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16rt_8way_hash( hash, vdata ); - pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n += 8; } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } #elif defined (X16RT_4WAY) +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; + union _x16rt_4way_context_overlay { blake512_4way_context blake; @@ -463,6 +589,7 @@ union _x16rt_4way_context_overlay jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; + hashState_luffa luffa1; cubehashParam cube; sph_shavite512_context shavite; simd_2way_context simd; @@ -474,14 +601,17 @@ union _x16rt_4way_context_overlay }; typedef union _x16rt_4way_context_overlay x16rt_4way_context_overlay; +static __thread x16rt_4way_context_overlay x16rt_ctx; + void x16rt_4way_hash( void* output, const void* input ) { - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] 
__attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t vhash[20*4] __attribute__ ((aligned (64))); x16rt_4way_context_overlay ctx; + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -500,15 +630,13 @@ void x16rt_4way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way_update( &ctx.blake, input, size ); + blake512_4way_full( &ctx.blake, vhash, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way_update( &ctx.blake, vhash, size ); + blake512_4way_full( &ctx.blake, vhash, vhash, size ); } - blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case BMW: @@ -524,38 +652,18 @@ void x16rt_4way_hash( void* output, const void* input ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way_update( &ctx.skein, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way_update( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); break; case JH: - jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way_update( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); @@ -573,29 +681,74 @@ void x16rt_4way_hash( void* output, const void* input ) keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; + case SKEIN: + if ( i == 0 ) + skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + skein512_4way_init( &ctx.skein ); + skein512_4way_update( &ctx.skein, vhash, size ); + } + skein512_4way_close( &ctx.skein, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + break; case LUFFA: - intrlv_2x128( vhash, in0, in1, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_2x128( hash2, hash3, vhash, 512 ); + if ( i == 0 ) + { + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash0, + (const BitSequence*)in0 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + 
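+ // each lane resumes from the prehashed luffa midstate saved in x16rt_ctx and absorbs only the final 16 of the 80 header bytes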
update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash1, + (const BitSequence*)in1 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash2, + (const BitSequence*)in2 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash3, + (const BitSequence*)in3 + 64, 16 ); + } + else + { + intrlv_2x128( vhash, in0, in1, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); + } break; case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3 + 64, 16 ); + + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)in3, size ); + } break; case SHAVITE: sph_shavite512_init( &ctx.shavite ); @@ -622,25 +775,26 @@ void x16rt_4way_hash( void* output, const void* input ) dintrlv_2x128( hash2, hash3, vhash, 512 ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); break; case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + if ( i == 0 ) + hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); + else + { + 
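+ // i > 0: lanes hold full 64-byte intermediate hashes, so interleave and run hamsi from a fresh init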
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_4way_close( &ctx.hamsi, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -657,32 +811,59 @@ void x16rt_4way_hash( void* output, const void* input ) sph_fugue512_close( &ctx.fugue, hash3 ); break; case SHABAL: - intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, size ); - shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); + if ( i == 0 ) + shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 ); + else + { + shabal512_4way_init( &ctx.shabal ); + shabal512_4way_update( &ctx.shabal, vhash, size ); + } + shabal512_4way_close( &ctx.shabal, vhash ); + dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - break; + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16rt_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + break; case SHA_512: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, size ); - sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + sha512_4way_init( &ctx.sha512 ); + if ( i == 0 ) + sha512_4way_update( &ctx.sha512, input, size ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + sha512_4way_update( &ctx.sha512, vhash, size ); + } + sha512_4way_close( &ctx.sha512, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; } size = 64; @@ -698,21 +879,21 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, { uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t vdata32[20*4] __attribute__ ((aligned (64))); + uint32_t 
edata[20] __attribute__ ((aligned (64))); uint32_t _ALIGN(64) timeHash[4*8]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - int thr_id = mythr->id; + const int thr_id = mythr->id; __m256i *noncev = (__m256i*)vdata + 9; // aligned volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0cff; + if ( bench ) ptarget[7] = 0x0cff; - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - uint32_t ntime = bswap_32( pdata[17] ); if ( s_ntime != ntime ) { @@ -724,24 +905,71 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce, hashOrder, ntime, timeHash ); } + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + jh512_4way_init( &x16rt_ctx.jh ); + jh512_4way_update( &x16rt_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_init( &x16rt_ctx.skein ); + skein512_4way_update( &x16rt_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + init_luffa( &x16rt_ctx.luffa1, 512 ); + update_luffa( &x16rt_ctx.luffa1, (const BitSequence*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16rt_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16rt_ctx.cube, (const byte*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + hamsi512_4way_init( &x16rt_ctx.hamsi ); + hamsi512_4way_update( &x16rt_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm128_bswap32_intrlv80_4x32( vdata32, pdata ); + shabal512_4way_init( &x16rt_ctx.shabal ); + shabal512_4way_update( &x16rt_ctx.shabal, vdata32, 64 ); + rintrlv_4x32_4x64( vdata, vdata32, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16rt_ctx.whirlpool ); + sph_whirlpool( &x16rt_ctx.whirlpool, edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + default: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + } + + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16rt_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) - if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + for ( int i = 0; i < 4; i++ ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; + } while ( ( n < last_nonce ) && !(*restart) ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index f1f2f08..e5812c4 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -33,11 +33,11 @@ #include "algo/echo/echo-hash-4way.h" #endif +#if defined (X16RV2_8WAY) + static __thread uint32_t s_ntime = 
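// last ntime seen; the hash order string is recomputed only when this changes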
UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; -#if defined (X16RV2_8WAY) - union _x16rv2_8way_context_overlay { blake512_8way_context blake; @@ -46,7 +46,7 @@ union _x16rv2_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; + cubehashParam cube; simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; @@ -66,6 +66,7 @@ union _x16rv2_8way_context_overlay } __attribute__ ((aligned (64))); typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay; +static __thread x16rv2_8way_context_overlay x16rv2_ctx; void x16rv2_8way_hash( void* output, const void* input ) { @@ -79,6 +80,7 @@ void x16rv2_8way_hash( void* output, const void* input ) uint32_t hash6[24] __attribute__ ((aligned (64))); uint32_t hash7[24] __attribute__ ((aligned (64))); x16rv2_8way_context_overlay ctx; + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -102,16 +104,15 @@ void x16rv2_8way_hash( void* output, const void* input ) case BLAKE: blake512_8way_init( &ctx.blake ); if ( i == 0 ) - blake512_8way_update( &ctx.blake, input, size ); + blake512_8way_full( &ctx.blake, vhash, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - blake512_8way_update( &ctx.blake, vhash, size ); + blake512_8way_full( &ctx.blake, vhash, vhash, size ); } - blake512_8way_close( &ctx.blake, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7, vhash ); break; case BMW: bmw512_8way_init( &ctx.bmw ); @@ -130,62 +131,30 @@ void x16rv2_8way_hash( void* output, const void* input ) case GROESTL: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + groestl512_4way_full( &ctx.groestl, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + groestl512_4way_full( &ctx.groestl, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); 
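+ // groestl512_full replaces the old init_groestl/update_and_final_groestl pairs with a single call per lane; the length argument is still a bit count, hence size<<3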
+ groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)in4, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)in5, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)in6, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 ); #endif - break; - case SKEIN: - skein512_8way_init( &ctx.skein ); - if ( i == 0 ) - skein512_8way_update( &ctx.skein, input, size ); - else - { - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - skein512_8way_update( &ctx.skein, vhash, size ); - } - skein512_8way_close( &ctx.skein, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); break; case JH: - jh512_8way_init( &ctx.jh ); if ( i == 0 ) - jh512_8way_update( &ctx.jh, input, size ); + jh512_8way_update( &ctx.jh, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); + jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, size ); } jh512_8way_close( &ctx.jh, vhash ); @@ -193,6 +162,35 @@ void x16rv2_8way_hash( void* output, const void* input ) hash7, vhash ); break; case KECCAK: + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in4 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash4 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in5 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash5 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in6 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash6 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in7 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash7 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + } + else + { sph_tiger_init( &ctx.tiger ); sph_tiger( &ctx.tiger, in0, size ); sph_tiger_close( &ctx.tiger, hash0 ); @@ -217,6 +215,7 @@ void x16rv2_8way_hash( void* output, const void* input ) sph_tiger_init( &ctx.tiger ); sph_tiger( &ctx.tiger, in7, size ); sph_tiger_close( &ctx.tiger, hash7 ); + } for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = @@ -230,64 +229,149 @@ void x16rv2_8way_hash( void* output, const void* input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; + case SKEIN: + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; case LUFFA: - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( 
&ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in4, size ); - sph_tiger_close( &ctx.tiger, hash4 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in5, size ); - sph_tiger_close( &ctx.tiger, hash5 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in6, size ); - sph_tiger_close( &ctx.tiger, hash6 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in7, size ); - sph_tiger_close( &ctx.tiger, hash7 ); + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in4 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash4 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in5 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash5 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in6 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash6 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in7 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash7 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in4, size ); + sph_tiger_close( &ctx.tiger, hash4 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in5, size ); + sph_tiger_close( &ctx.tiger, hash5 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in6, size ); + sph_tiger_close( &ctx.tiger, hash6 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in7, size ); + sph_tiger_close( &ctx.tiger, hash7 ); + } for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0; intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, 64 ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, 64 ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case CUBEHASH: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( 
&ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)in3 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash4, + (const byte*)in4 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash5, + (const byte*)in5 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash6, + (const byte*)in6 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash7, + (const byte*)in7 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash4, + (const byte*)in4, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash5, + (const byte*)in5, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash6, + (const byte*)in6, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash7, + (const byte*)in7, size ); + } break; case SHAVITE: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size ); + shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - shavite512_4way_init( &ctx.shavite ); - shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size ); + shavite512_4way_full( &ctx.shavite, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else sph_shavite512_init( &ctx.shavite ); @@ -315,100 +399,126 @@ void x16rv2_8way_hash( void* output, const void* input ) sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); #endif - break; + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( 
&ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)in4, size ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)in5, size ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)in6, size ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)in7, size ); #endif - break; + break; case HAMSI: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + if ( i == 0 ) + hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, size ); - hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - break; + break; case FUGUE: - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in0, size ); - sph_fugue512_close( &ctx.fugue, hash0 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in1, size ); - sph_fugue512_close( &ctx.fugue, hash1 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in2, size ); - sph_fugue512_close( &ctx.fugue, hash2 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in3, size ); - sph_fugue512_close( &ctx.fugue, hash3 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in4, size ); - sph_fugue512_close( &ctx.fugue, hash4 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in5, size ); - sph_fugue512_close( &ctx.fugue, hash5 ); - sph_fugue512_init( 
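/* [editor's note] Sketch of the offset arithmetic used by the i == 0
 * midstate branches in these hunks (editorial, not new behaviour). In an
 * interleaved buffer the lanes are woven word by word, so skipping the first
 * 64 bytes of each 80-byte header means skipping 64 * nlanes bytes of the
 * vector buffer:
 *
 *   input + (64<<3)   // 8-way 64-bit lanes: 64 bytes * 8 lanes
 *   input + (64<<2)   // 4-way 64-bit lanes: 64 bytes * 4 lanes
 *   vhash + (16<<3)   // shabal, 8-way 32-bit lanes: vhash is uint32_t*,
 *                     // so 64 bytes = 16 words, times 8 lanes
 *
 * The remaining 16 bytes per lane are then absorbed on top of the resumed
 * midstate.
 */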
&ctx.fugue ); - sph_fugue512( &ctx.fugue, in6, size ); - sph_fugue512_close( &ctx.fugue, hash6 ); - sph_fugue512_init( &ctx.fugue ); - sph_fugue512( &ctx.fugue, in7, size ); - sph_fugue512_close( &ctx.fugue, hash7 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in0, size ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in1, size ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in2, size ); + sph_fugue512_close( &ctx.fugue, hash2 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in3, size ); + sph_fugue512_close( &ctx.fugue, hash3 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in4, size ); + sph_fugue512_close( &ctx.fugue, hash4 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in5, size ); + sph_fugue512_close( &ctx.fugue, hash5 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in6, size ); + sph_fugue512_close( &ctx.fugue, hash6 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in7, size ); + sph_fugue512_close( &ctx.fugue, hash7 ); break; case SHABAL: - intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, size ); - shabal512_8way_close( &ctx.shabal, vhash ); - dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); + intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + if ( i == 0 ) + shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 ); + else + { + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + } + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); break; case WHIRLPOOL: + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + else + { sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, in0, size ); sph_whirlpool_close( &ctx.whirlpool, hash0 ); @@ -433,8 +543,38 @@ void x16rv2_8way_hash( void* output, const void* input ) sph_whirlpool_init( &ctx.whirlpool ); sph_whirlpool( &ctx.whirlpool, in7, size ); sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } break; case SHA_512: + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 
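/* [editor's note] Context sketch, not part of the patch: x16rv2 differs from
 * x16r in that the KECCAK, LUFFA and SHA_512 stages digest a Tiger hash of
 * the data rather than the data itself. Tiger emits 24 bytes, which are
 * zero-extended to a full 64-byte block, as in the padding loops below:
 *
 *   // pad a 24-byte tiger digest (six uint32_t words) out to 64 bytes
 *   for ( int w = 24/4; w < 64/4; w++ )
 *       hash0[w] = 0;
 *
 * With the i == 0 midstate path, only the final 16 header bytes (in + 64)
 * are pushed through Tiger here; the first 64 bytes were absorbed once in
 * the scanhash prehash.
 */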
+ 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in4 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash4 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in5 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash5 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in6 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash6 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in7 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash7 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + } + else + { sph_tiger_init( &ctx.tiger ); sph_tiger( &ctx.tiger, in0, size ); sph_tiger_close( &ctx.tiger, hash0 ); @@ -459,6 +599,7 @@ void x16rv2_8way_hash( void* output, const void* input ) sph_tiger_init( &ctx.tiger ); sph_tiger( &ctx.tiger, in7, size ); sph_tiger_close( &ctx.tiger, hash7 ); + } for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = @@ -489,21 +630,22 @@ void x16rv2_8way_hash( void* output, const void* input ) int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[8*16] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t vdata2[20*8] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 8; uint32_t n = first_nonce; __m512i *noncev = (__m512i*)vdata + 9; // aligned - int thr_id = mythr->id; + const int thr_id = mythr->id; volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0cff; + if ( bench ) ptarget[7] = 0x0cff; mm512_bswap32_intrlv80_8x64( vdata, pdata ); @@ -515,34 +657,89 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder ); s_ntime = ntime; if ( opt_debug && !thr_id ) - applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); + applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
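/* [editor's note] The ternary this comment interrupts decodes one character
 * of the 16-digit hash-order string: '0'..'9' map to stage indices 0..9 and
 * 'A'..'F' to 10..15, i.e. each hex digit names one of the sixteen hash
 * functions. The same mapping as a plain helper (editor's illustration):
 *
 *   static uint8_t order_digit( char c )
 *   {
 *      return c >= 'A' ? (uint8_t)(c - 'A' + 10)   // 'A'..'F' -> 10..15
 *                      : (uint8_t)(c - '0');       // '0'..'9' -> 0..9
 *   }
 */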
elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + jh512_8way_init( &x16rv2_ctx.jh ); + jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 ); + break; + case KECCAK: + case LUFFA: + case SHA_512: + mm128_bswap32_80( edata, pdata ); + sph_tiger_init( &x16rv2_ctx.tiger ); + sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case SKEIN: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_init( &x16rv2_ctx.skein ); + skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + hamsi512_8way_init( &x16rv2_ctx.hamsi ); + hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm256_bswap32_intrlv80_8x32( vdata2, pdata ); + shabal512_8way_init( &x16rv2_ctx.shabal ); + shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 ); + rintrlv_8x32_8x64( vdata, vdata2, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16rv2_ctx.whirlpool ); + sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + default: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + } + + *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( + n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16rv2_8way_hash( hash, vdata ); - pdata[19] = n; for ( int i = 0; i < 8; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n += 8; } while ( likely( ( n < last_nonce ) && !(*restart) ) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } #elif defined (X16RV2_4WAY) +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; + union _x16rv2_4way_context_overlay { blake512_4way_context blake; @@ -565,6 +762,8 @@ union _x16rv2_4way_context_overlay }; typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay; +static __thread x16rv2_4way_context_overlay x16rv2_ctx; + // Pad the 24 bytes tiger hash to 64 bytes inline void padtiger512( uint32_t* hash ) { @@ -573,12 +772,13 @@ inline void padtiger512( uint32_t* hash ) void x16rv2_4way_hash( void* output, const void* input ) { - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned 
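/* [editor's note] Illustrative summary of the midstate scheme set up by the
 * prehash switch above (an editorial reading of the surrounding hunks): the
 * first 64 bytes of the 80-byte header never change while scanning nonces,
 * so they are absorbed once per work/ntime into a __thread context, and each
 * nonce only finishes the last 16 bytes:
 *
 *   // once per new work, in scanhash:
 *   sph_tiger_init( &thread_ctx.tiger );
 *   sph_tiger( &thread_ctx.tiger, header, 64 );      // 64-byte midstate
 *
 *   // per lane, in the hash function when this stage runs first:
 *   memcpy( &ctx, &thread_ctx, sizeof(ctx) );        // resume midstate
 *   sph_tiger( &ctx.tiger, header + 64, 16 );        // nonce-bearing tail
 *   sph_tiger_close( &ctx.tiger, digest );
 *
 * (header and thread_ctx are illustrative names; the code uses edata and
 * x16rv2_ctx.)
 */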
(64))); + uint32_t vhash[20*4] __attribute__ ((aligned (64))); x16rv2_4way_context_overlay ctx; + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -595,16 +795,14 @@ void x16rv2_4way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way_update( &ctx.blake, input, size ); + blake512_4way_full( &ctx.blake, vhash, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way_update( &ctx.blake, vhash, size ); + blake512_4way_full( &ctx.blake, vhash, vhash, size ); } - blake512_4way_close( &ctx.blake, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case BMW: bmw512_4way_init( &ctx.bmw ); @@ -616,60 +814,56 @@ void x16rv2_4way_hash( void* output, const void* input ) bmw512_4way_update( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way_update( &ctx.skein, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way_update( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); break; case JH: - jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way_update( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case KECCAK: - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, 
&x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + } for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; @@ -679,95 +873,134 @@ void x16rv2_4way_hash( void* output, const void* input ) keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; + case SKEIN: + if ( i == 0 ) + skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + skein512_4way_init( &ctx.skein ); + skein512_4way_update( &ctx.skein, vhash, size ); + } + skein512_4way_close( &ctx.skein, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); + break; case LUFFA: - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + } for ( int i = (24/4); i < (64/4); i++ ) - hash0[i] = hash1[i] = 0; + hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; intrlv_2x128( vhash, hash0, hash1, 512 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); dintrlv_2x128( hash0, hash1, vhash, 512 ); - - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - - for ( int i = (24/4); i < (64/4); i++ ) - hash2[i] = hash3[i] = 0; - intrlv_2x128( vhash, hash2, hash3, 512 ); luffa_2way_init( &ctx.luffa, 512 ); luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); dintrlv_2x128( hash2, hash3, vhash, 512 ); break; case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 
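/* [editor's note] Why cubehash prehashes exactly 64 of the 80 header bytes
 * (sketch matching the calls in this hunk): this cubehash is configured with
 * a 32-byte block, cubehashInit( ctx, 512, 16, 32 ), so 64 bytes are two
 * whole blocks that can be transformed once up front, while the final 16
 * nonce-bearing bytes stay buffered:
 *
 *   cubehashInit( &midstate, 512, 16, 32 );
 *   cubehashUpdate( &midstate, (const byte*)header, 64 );   // 2 full blocks
 *   // ... per lane:
 *   memcpy( &ctx, &midstate, sizeof(ctx) );
 *   cubehashUpdateDigest( &ctx, digest, (const byte*)header + 64, 16 );
 *
 * midstate and header are illustrative names for the x16rv2_ctx.cube and
 * edata used by the code.
 */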
); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + } break; case SHAVITE: - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in0, size ); - sph_shavite512_close( &ctx.shavite, hash0 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in1, size ); - sph_shavite512_close( &ctx.shavite, hash1 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in2, size ); - sph_shavite512_close( &ctx.shavite, hash2 ); - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, in3, size ); - sph_shavite512_close( &ctx.shavite, hash3 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in0, size ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in1, size ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in2, size ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in3, size ); + sph_shavite512_close( &ctx.shavite, hash3 ); break; case SIMD: - intrlv_2x128( vhash, in0, in1, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - simd_2way_init( &ctx.simd, 512 ); - simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); - dintrlv_2x128( hash2, hash3, vhash, 512 ); + intrlv_2x128( vhash, in0, in1, size<<3 ); + simd512_2way_full( &ctx.simd, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + simd512_2way_full( &ctx.simd, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const 
BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); break; case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + if ( i == 0 ) + hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_4way_close( &ctx.hamsi, vhash ); + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -785,39 +1018,77 @@ void x16rv2_4way_hash( void* output, const void* input ) break; case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, size ); + if ( i == 0 ) + shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 ); + else + { + shabal512_4way_init( &ctx.shabal ); + shabal512_4way_update( &ctx.shabal, vhash, size ); + } shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - break; - case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); + break; + case WHIRLPOOL: + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } break; case SHA_512: - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - + if ( i == 0 ) + { + sph_tiger( &ctx.tiger, in0 + 64, 16 ); + 
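/* [editor's note] Sketch of the context-overlay pattern used throughout this
 * file (editorial illustration): all per-stage states share one union, so a
 * single memcpy from the __thread prehash copy restores whichever midstate
 * the first stage needs, without re-running its init/update:
 *
 *   union ctx_overlay { sph_tiger_context tiger; cubehashParam cube;
 *                       sph_whirlpool_context whirlpool; };  // abridged
 *   static __thread union ctx_overlay prehashed;   // filled in scanhash
 *
 *   union ctx_overlay ctx;
 *   memcpy( &ctx, &prehashed, sizeof(ctx) );       // resume lane state
 *
 * Copying the whole union keeps the per-lane restore identical for every
 * algorithm, at the cost of copying more bytes than the active member needs.
 */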
sph_tiger_close( &ctx.tiger, hash0 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in1 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash1 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in2 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash2 ); + memcpy( &ctx, &x16rv2_ctx, sizeof(ctx) ); + sph_tiger( &ctx.tiger, in3 + 64, 16 ); + sph_tiger_close( &ctx.tiger, hash3 ); + } + else + { + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + } for ( int i = (24/4); i < (64/4); i++ ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; @@ -841,20 +1112,21 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, { uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t vdata32[20*4] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - int thr_id = mythr->id; + const int thr_id = mythr->id; __m256i *noncev = (__m256i*)vdata + 9; // aligned volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0fff; + if ( bench ) ptarget[7] = 0x0fff; - mm256_bswap32_intrlv80_4x64( vdata, pdata ); bedata1[0] = bswap_32( pdata[1] ); bedata1[1] = bswap_32( pdata[2] ); @@ -867,25 +1139,74 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + jh512_4way_init( &x16rv2_ctx.jh ); + jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 ); + break; + case KECCAK: + case LUFFA: + case SHA_512: + mm128_bswap32_80( edata, pdata ); + sph_tiger_init( &x16rv2_ctx.tiger ); + sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case SKEIN: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_init( &x16rv2_ctx.skein ); + skein512_4way_update( &x16rv2_ctx.skein, vdata, 64 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + hamsi512_4way_init( &x16rv2_ctx.hamsi ); + hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm128_bswap32_intrlv80_4x32( vdata32, pdata ); + shabal512_4way_init( &x16rv2_ctx.shabal ); + shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 ); + rintrlv_4x32_4x64( vdata, vdata32, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x16rv2_ctx.whirlpool ); + sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + default: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + } + + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x16rv2_4way_hash( hash, vdata ); - pdata[19] = n; - for ( int i = 0; i < 4; i++ ) - if ( unlikely( (hash+(i<<3))[7] <= Htarg ) ) - if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( likely( ( n < max_nonce ) && !(*restart) ) ); - - *hashes_done = n - first_nonce + 1; + } while ( likely( ( n < last_nonce ) && !(*restart) ) ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 1dc9cee..e84163c 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -17,6 +17,7 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa-hash-2way.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/cubehash/cube-hash-2way.h" #include "algo/simd/simd-hash-2way.h" @@ -58,7 +59,8 @@ union _x21s_8way_context_overlay jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; - cube_4way_context cube; + cubehashParam cube; +// cube_4way_context cube; simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; @@ -82,18 +84,21 @@ union _x21s_8way_context_overlay typedef union _x21s_8way_context_overlay x21s_8way_context_overlay; +static __thread x21s_8way_context_overlay x21s_ctx; + void x21s_8way_hash( void* output, const void* input ) { - uint32_t vhash[24*8] __attribute__ ((aligned (128))); - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t 
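/* [editor's note] On the nonce handling rewritten in the scan loops above
 * (editorial sketch of the idea): instead of building and byte-swapping a
 * fresh nonce vector every iteration, the vector is seeded once and then
 * advanced in place, keeping the counter in the header's post-bswap domain;
 * the per-iteration mm256_bswap_32 disappears and the winning nonce is
 * byte-swapped once at submission:
 *
 *   *noncev = mm256_intrlv_blend_32(
 *                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
 *   do {
 *      // hash all lanes, test with valid_hash() ...
 *      // 0x0000000400000000 adds 4 to the high 32-bit half of each
 *      // 64-bit element, i.e. to every lane's nonce word
 *      *noncev = _mm256_add_epi32( *noncev,
 *                                  m256_const1_64( 0x0000000400000000 ) );
 *      n += 4;
 *   } while ( n < last_nonce );
 *   pdata[19] = bswap_32( n + lane );   // restore canonical order on submit
 */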
hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t hash4[24] __attribute__ ((aligned (64))); - uint32_t hash5[24] __attribute__ ((aligned (64))); - uint32_t hash6[24] __attribute__ ((aligned (64))); - uint32_t hash7[24] __attribute__ ((aligned (64))); + uint32_t vhash[20*8] __attribute__ ((aligned (128))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t hash4[20] __attribute__ ((aligned (64))); + uint32_t hash5[20] __attribute__ ((aligned (64))); + uint32_t hash6[20] __attribute__ ((aligned (64))); + uint32_t hash7[20] __attribute__ ((aligned (64))); x21s_8way_context_overlay ctx; + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -115,18 +120,16 @@ void x21s_8way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_8way_init( &ctx.blake ); if ( i == 0 ) - blake512_8way_update( &ctx.blake, input, size ); + blake512_8way_full( &ctx.blake, vhash, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - blake512_8way_update( &ctx.blake, vhash, size ); + blake512_8way_full( &ctx.blake, vhash, vhash, size ); } - blake512_8way_close( &ctx.blake, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7, vhash ); break; case BMW: bmw512_8way_init( &ctx.bmw ); @@ -145,62 +148,30 @@ void x21s_8way_hash( void* output, const void* input ) case GROESTL: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + groestl512_4way_full( &ctx.groestl, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - groestl512_4way_init( &ctx.groestl, 64 ); - groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + groestl512_4way_full( &ctx.groestl, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( 
&ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash4, (char*)in4, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash5, (char*)in5, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash6, (char*)in6, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 ); #endif - break; - case SKEIN: - skein512_8way_init( &ctx.skein ); - if ( i == 0 ) - skein512_8way_update( &ctx.skein, input, size ); - else - { - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - skein512_8way_update( &ctx.skein, vhash, size ); - } - skein512_8way_close( &ctx.skein, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); break; case JH: - jh512_8way_init( &ctx.jh ); if ( i == 0 ) - jh512_8way_update( &ctx.jh, input, size ); + jh512_8way_update( &ctx.jh, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); + jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, size ); } jh512_8way_close( &ctx.jh, vhash ); @@ -221,25 +192,97 @@ void x21s_8way_hash( void* output, const void* input ) dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; + case SKEIN: + if ( i == 0 ) + skein512_8way_update( &ctx.skein, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, size ); + } + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7, vhash ); + break; case LUFFA: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa_4way_update_close( &ctx.luffa, vhash, + vhash + (16<<2), 16 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } + else + { + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + luffa512_4way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + } break; case CUBEHASH: - intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, size ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1 + 64, 16 ); + 
memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)in3 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash4, + (const byte*)in4 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash5, + (const byte*)in5 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash6, + (const byte*)in6 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash7, + (const byte*)in7 + 64, 16 ); + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash4, + (const byte*)in4, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash5, + (const byte*)in5, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash6, + (const byte*)in6, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash7, + (const byte*)in7, size ); + } break; case SHAVITE: #if defined(__VAES__) @@ -277,64 +320,56 @@ void x21s_8way_hash( void* output, const void* input ) sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); #endif - break; + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + simd512_4way_full( &ctx.simd, vhash, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: #if defined(__VAES__) intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); - echo_4way_init( &ctx.echo, 512 ); - echo_4way_update_close( &ctx.echo, vhash, vhash, size<<3 ); + echo_4way_full( &ctx.echo, vhash, 512, vhash, size ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); #else - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - init_echo( 
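/* [editor's note] Hedged reading of the cubehash change above, inferred from
 * the cube_4way_context member commented out of the x21s_8way overlay
 * earlier in this file: the overlay now carries the scalar cubehashParam so
 * the 64-byte midstate computed once in scanhash can be memcpy-restored per
 * lane, leaving only the 16 nonce bytes to transform when cubehash runs
 * first in the order:
 *
 *   memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
 *   cubehashUpdateDigest( &ctx.cube, (byte*)out,
 *                         (const byte*)in + 64, 16 );
 *
 * For later positions in the chain the full input is hashed from a fresh
 * cubehashInit, as in the else branch.
 */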
&ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); + echo_full( &ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); + echo_full( &ctx.echo, (BitSequence *)hash4, 512, + (const BitSequence *)in4, size ); + echo_full( &ctx.echo, (BitSequence *)hash5, 512, + (const BitSequence *)in5, size ); + echo_full( &ctx.echo, (BitSequence *)hash6, 512, + (const BitSequence *)in6, size ); + echo_full( &ctx.echo, (BitSequence *)hash7, 512, + (const BitSequence *)in7, size ); #endif - break; + break; case HAMSI: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + if ( i == 0 ) + hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, size ); - hamsi512_8way_close( &ctx.hamsi, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - break; + break; case FUGUE: sph_fugue512_init( &ctx.fugue ); sph_fugue512( &ctx.fugue, in0, size ); @@ -363,48 +398,87 @@ void x21s_8way_hash( void* output, const void* input ) break; case SHABAL: intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, size ); + size<<3 ); + if ( i == 0 ) + shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 ); + else + { + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, size ); + } shabal512_8way_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in4, size ); - sph_whirlpool_close( &ctx.whirlpool, hash4 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in5, size ); - sph_whirlpool_close( &ctx.whirlpool, hash5 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in6, size ); - sph_whirlpool_close( &ctx.whirlpool, hash6 ); - sph_whirlpool_init( &ctx.whirlpool ); - 
sph_whirlpool( &ctx.whirlpool, in7, size ); - sph_whirlpool_close( &ctx.whirlpool, hash7 ); + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in4, size ); + sph_whirlpool_close( &ctx.whirlpool, hash4 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in5, size ); + sph_whirlpool_close( &ctx.whirlpool, hash5 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in6, size ); + sph_whirlpool_close( &ctx.whirlpool, hash6 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in7, size ); + sph_whirlpool_close( &ctx.whirlpool, hash7 ); + } break; case SHA_512: - intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, - size<<3 ); sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, size ); + if ( i == 0 ) + sha512_8way_update( &ctx.sha512, input, size ); + else + { + intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, + size<<3 ); + sha512_8way_update( &ctx.sha512, vhash, size ); + } sha512_8way_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, vhash ); - break; + hash7, vhash ); + break; } size = 64; } @@ -492,8 +566,10 @@ void x21s_8way_hash( void* output, const void* input ) int scanhash_x21s_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[8*16] __attribute__ ((aligned (128))); - uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t hash[16*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t vdata2[20*8] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t *hash7 = &hash[7<<3]; uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); @@ -503,14 +579,12 @@ int scanhash_x21s_8way( struct work *work, 
uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; const uint32_t last_nonce = max_nonce - 16; - int thr_id = mythr->id; + const int thr_id = mythr->id; __m512i *noncev = (__m512i*)vdata + 9; // aligned volatile uint8_t *restart = &(work_restart[thr_id].restart); + const bool bench = opt_benchmark; - if ( opt_benchmark ) - ptarget[7] = 0x0cff; - - mm512_bswap32_intrlv80_8x64( vdata, pdata ); + if ( bench ) ptarget[7] = 0x0cff; bedata1[0] = bswap_32( pdata[1] ); bedata1[1] = bswap_32( pdata[2] ); @@ -523,28 +597,81 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce, applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime ); } + // Do midstate prehash on hash functions with block size <= 64 bytes. + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + switch ( algo ) + { + case JH: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + jh512_8way_init( &x21s_ctx.jh ); + jh512_8way_update( &x21s_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + skein512_8way_init( &x21s_ctx.skein ); + skein512_8way_update( &x21s_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + intrlv_4x128( vdata2, edata, edata, edata, edata, 640 ); + luffa_4way_init( &x21s_ctx.luffa, 512 ); + luffa_4way_update( &x21s_ctx.luffa, vdata2, 64 ); + rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x21s_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x21s_ctx.cube, (const byte*)edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + hamsi512_8way_init( &x21s_ctx.hamsi ); + hamsi512_8way_update( &x21s_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm256_bswap32_intrlv80_8x32( vdata2, pdata ); + shabal512_8way_init( &x21s_ctx.shabal ); + shabal512_8way_update( &x21s_ctx.shabal, vdata2, 64 ); + rintrlv_8x32_8x64( vdata, vdata2, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x21s_ctx.whirlpool ); + sph_whirlpool( &x21s_ctx.whirlpool, edata, 64 ); + intrlv_8x64( vdata, edata, edata, edata, edata, + edata, edata, edata, edata, 640 ); + break; + default: + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + } + + *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( + n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); + + do { - *noncev = mm512_intrlv_blend_32( mm512_bswap_32( - _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, - n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x21s_8way_hash( hash, vdata ); - pdata[19] = n; for ( int lane = 0; lane < 8; lane++ ) if ( unlikely( hash7[lane] <= Htarg ) ) { extr_lane_8x32( lane_hash, hash, lane, 256 ); - if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) { - pdata[19] = n + lane; + pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, lane_hash, mythr, lane ); } } + *noncev = _mm512_add_epi32( *noncev, + m512_const1_64( 0x0000000800000000 ) ); n += 8; } while ( ( n < last_nonce ) && !(*restart) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } @@ -573,6 +700,7 @@ union _x21s_4way_context_overlay jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; + hashState_luffa luffa1; cubehashParam cube; sph_shavite512_context shavite; simd_2way_context 
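/* [editor's note] Side note on the __attribute__ ((aligned (64))) added to
 * this union a few lines below (editorial, hedged): the overlay is memcpy'd
 * from a __thread prehash copy and its members are accessed with 128/256-bit
 * vector loads, so forcing cache-line alignment keeps those accesses on
 * their natural boundaries:
 *
 *   union ctx_overlay { ... } __attribute__ ((aligned (64)));
 *
 * Without the attribute the union would only get the strictest alignment of
 * its members, which may be below a full cache line.
 */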
simd; @@ -589,17 +717,20 @@ union _x21s_4way_context_overlay #else sha256_4way_context sha256; #endif -}; +} __attribute__ ((aligned (64))); typedef union _x21s_4way_context_overlay x21s_4way_context_overlay; +static __thread x21s_4way_context_overlay x21s_ctx; + void x21s_4way_hash( void* output, const void* input ) { - uint32_t hash0[24] __attribute__ ((aligned (64))); - uint32_t hash1[24] __attribute__ ((aligned (64))); - uint32_t hash2[24] __attribute__ ((aligned (64))); - uint32_t hash3[24] __attribute__ ((aligned (64))); - uint32_t vhash[24*4] __attribute__ ((aligned (64))); + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t vhash[20*4] __attribute__ ((aligned (64))); x21s_4way_context_overlay ctx; + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); void *in0 = (void*) hash0; void *in1 = (void*) hash1; void *in2 = (void*) hash2; @@ -624,15 +755,13 @@ void x21s_4way_hash( void* output, const void* input ) switch ( algo ) { case BLAKE: - blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way_update( &ctx.blake, input, size ); + blake512_4way_full( &ctx.blake, vhash, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way_update( &ctx.blake, vhash, size ); + blake512_4way_full( &ctx.blake, vhash, vhash, size ); } - blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case BMW: @@ -648,38 +777,18 @@ void x21s_4way_hash( void* output, const void* input ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - break; - case SKEIN: - skein512_4way_init( &ctx.skein ); - if ( i == 0 ) - skein512_4way_update( &ctx.skein, input, size ); - else - { - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way_update( &ctx.skein, vhash, size ); - } - skein512_4way_close( &ctx.skein, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); + groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); break; case JH: - jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way_update( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + jh512_4way_init( &ctx.jh ); jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); @@ -697,29 +806,74 @@ void x21s_4way_hash( void* output, const void* input ) keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; + case SKEIN: + if ( i == 0 ) + skein512_4way_update( &ctx.skein, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + skein512_4way_init( &ctx.skein ); + skein512_4way_update( 
&ctx.skein, vhash, size ); + } + skein512_4way_close( &ctx.skein, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + break; case LUFFA: - intrlv_2x128( vhash, in0, in1, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); - dintrlv_2x128( hash0, hash1, vhash, 512 ); - intrlv_2x128( vhash, in2, in3, size<<3 ); - luffa_2way_init( &ctx.luffa, 512 ); - luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); - dintrlv_2x128( hash2, hash3, vhash, 512 ); + if ( i == 0 ) + { + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash0, + (const BitSequence*)in0 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash1, + (const BitSequence*)in1 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash2, + (const BitSequence*)in2 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash3, + (const BitSequence*)in3 + 64, 16 ); + } + else + { + intrlv_2x128( vhash, in0, in1, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + luffa512_2way_full( &ctx.luffa, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); + } break; case CUBEHASH: - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash0, - (const byte*)in0, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash1, - (const byte*)in1, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash2, - (const byte*)in2, size ); - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash3, - (const byte*)in3, size ); + if ( i == 0 ) + { + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2 + 64, 16 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3 + 64, 16 ); + + } + else + { + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*)in0, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, + (const byte*)in1, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash2, + (const byte*)in2, size ); + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash3, + (const byte*)in3, size ); + } break; case SHAVITE: sph_shavite512_init( &ctx.shavite ); @@ -746,25 +900,26 @@ void x21s_4way_hash( void* output, const void* input ) dintrlv_2x128( hash2, hash3, vhash, 512 ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); + echo_full( 
&ctx.echo, (BitSequence *)hash0, 512, + (const BitSequence *)in0, size ); + echo_full( &ctx.echo, (BitSequence *)hash1, 512, + (const BitSequence *)in1, size ); + echo_full( &ctx.echo, (BitSequence *)hash2, 512, + (const BitSequence *)in2, size ); + echo_full( &ctx.echo, (BitSequence *)hash3, 512, + (const BitSequence *)in3, size ); break; case HAMSI: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, size ); - hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + if ( i == 0 ) + hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); + } + hamsi512_4way_close( &ctx.hamsi, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -781,32 +936,59 @@ void x21s_4way_hash( void* output, const void* input ) sph_fugue512_close( &ctx.fugue, hash3 ); break; case SHABAL: - intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, size ); - shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); + if ( i == 0 ) + shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 ); + else + { + shabal512_4way_init( &ctx.shabal ); + shabal512_4way_update( &ctx.shabal, vhash, size ); + } + shabal512_4way_close( &ctx.shabal, vhash ); + dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; case WHIRLPOOL: - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in0, size ); - sph_whirlpool_close( &ctx.whirlpool, hash0 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in1, size ); - sph_whirlpool_close( &ctx.whirlpool, hash1 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in2, size ); - sph_whirlpool_close( &ctx.whirlpool, hash2 ); - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, in3, size ); - sph_whirlpool_close( &ctx.whirlpool, hash3 ); + if ( i == 0 ) + { + sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + memcpy( &ctx, &x21s_ctx, sizeof(ctx) ); + sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } + else + { + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in0, size ); + sph_whirlpool_close( &ctx.whirlpool, hash0 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in1, size ); + sph_whirlpool_close( &ctx.whirlpool, hash1 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in2, size ); + sph_whirlpool_close( &ctx.whirlpool, hash2 ); + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, in3, size ); + sph_whirlpool_close( &ctx.whirlpool, hash3 ); + } break; case SHA_512: - intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, size ); - sha512_4way_close( &ctx.sha512, vhash 
); - dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + sha512_4way_init( &ctx.sha512 ); + if ( i == 0 ) + sha512_4way_update( &ctx.sha512, input, size ); + else + { + intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); + sha512_4way_update( &ctx.sha512, vhash, size ); + } + sha512_4way_close( &ctx.sha512, vhash ); + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; } size = 64; @@ -889,23 +1071,23 @@ void x21s_4way_hash( void* output, const void* input ) int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) { - uint32_t hash[4*16] __attribute__ ((aligned (64))); - uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t hash[16*4] __attribute__ ((aligned (64))); + uint32_t vdata[20*4] __attribute__ ((aligned (64))); + uint32_t vdata32[20*4] __attribute__ ((aligned (64))); + uint32_t edata[20] __attribute__ ((aligned (64))); uint32_t bedata1[2] __attribute__((aligned(64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - int thr_id = mythr->id; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; __m256i *noncev = (__m256i*)vdata + 9; // aligned volatile uint8_t *restart = &(work_restart[thr_id].restart); - if ( opt_benchmark ) - ptarget[7] = 0x0cff; + if ( bench ) ptarget[7] = 0x0cff; - mm256_bswap32_intrlv80_4x64( vdata, pdata ); - bedata1[0] = bswap_32( pdata[1] ); bedata1[1] = bswap_32( pdata[2] ); uint32_t ntime = bswap_32( pdata[17] ); @@ -916,25 +1098,73 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce, if ( opt_debug && !thr_id ) applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); } + + const char elem = hashOrder[0]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch ( algo ) + { + case JH: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + jh512_4way_init( &x21s_ctx.jh ); + jh512_4way_update( &x21s_ctx.jh, vdata, 64 ); + break; + case SKEIN: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + skein512_4way_init( &x21s_ctx.skein ); + skein512_4way_update( &x21s_ctx.skein, vdata, 64 ); + break; + case LUFFA: + mm128_bswap32_80( edata, pdata ); + init_luffa( &x21s_ctx.luffa1, 512 ); + update_luffa( &x21s_ctx.luffa1, (const BitSequence*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case CUBEHASH: + mm128_bswap32_80( edata, pdata ); + cubehashInit( &x21s_ctx.cube, 512, 16, 32 ); + cubehashUpdate( &x21s_ctx.cube, (const byte*)edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + case HAMSI: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + hamsi512_4way_init( &x21s_ctx.hamsi ); + hamsi512_4way_update( &x21s_ctx.hamsi, vdata, 64 ); + break; + case SHABAL: + mm128_bswap32_intrlv80_4x32( vdata32, pdata ); + shabal512_4way_init( &x21s_ctx.shabal ); + shabal512_4way_update( &x21s_ctx.shabal, vdata32, 64 ); + rintrlv_4x32_4x64( vdata, vdata32, 640 ); + break; + case WHIRLPOOL: + mm128_bswap32_80( edata, pdata ); + sph_whirlpool_init( &x21s_ctx.whirlpool ); + sph_whirlpool( &x21s_ctx.whirlpool, edata, 64 ); + intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); + break; + default: + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + } + + *noncev = mm256_intrlv_blend_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - x21s_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) - if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + for ( int i = 0; i < 4; i++ ) + if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { - pdata[19] = n+i; + pdata[19] = bswap_32( n+i ); submit_lane_solution( work, hash+(i<<3), mythr, i ); } + *noncev = _mm256_add_epi32( *noncev, + m256_const1_64( 0x0000000400000000 ) ); n += 4; - } while ( ( n < max_nonce ) && !(*restart) ); - - *hashes_done = n - first_nonce + 1; + } while ( ( n < last_nonce ) && !(*restart) ); + pdata[19] = n; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index d2bbdd0..e796321 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -310,10 +310,10 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce, x17_8way_hash( hash, vdata ); for ( int lane = 0; lane < 8; lane++ ) - if unlikely( ( hash7[ lane ] <= Htarg ) && !bench ) + if ( unlikely( ( hash7[ lane ] <= Htarg ) && !bench ) ) { extr_lane_8x32( lane_hash, hash, lane, 256 ); - if likely( valid_hash( lane_hash, ptarget ) ) + if ( likely( valid_hash( lane_hash, ptarget ) ) ) { pdata[19] = bswap_32( n + lane ); submit_lane_solution( work, lane_hash, mythr, lane ); @@ -323,7 +323,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce, m512_const1_64( 0x0000000800000000 ) ); n += 8; } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); - + pdata[19] = n; *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x17/x17.c b/algo/x17/x17.c index bb29850..95c30a3 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -71,9 +71,7 @@ void x17_hash(void *output, const void *input) sph_bmw512_close(&ctx.bmw, hash); #if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - 
update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + groestl512_full( &ctx.groestl, (char*)hash, (const char*)hash, 512 ); #else sph_groestl512_init( &ctx.groestl ); sph_groestl512( &ctx.groestl, hash, 64 ); @@ -92,14 +90,11 @@ void x17_hash(void *output, const void *input) sph_keccak512(&ctx.keccak, (const void*) hash, 64); sph_keccak512_close(&ctx.keccak, hash); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); + luffa_full( &ctx.luffa, (BitSequence*)hash, 512, + (const BitSequence*)hash, 64 ); // 8 Cube - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)hash, 64 ); + cubehash_full( &ctx.cube, (byte*) hash, 512, (const byte*)hash, 64 ); // 9 Shavite sph_shavite512_init( &ctx.shavite ); @@ -107,15 +102,13 @@ void x17_hash(void *output, const void *input) sph_shavite512_close( &ctx.shavite, hash); // 10 Simd - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence*)hash, + simd_full( &ctx.simd, (BitSequence*)hash, (const BitSequence*)hash, 512 ); //11---echo--- #if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence*)hash, - (const BitSequence*)hash, 512 ); + echo_full( &ctx.echo, (BitSequence *)hash, 512, + (const BitSequence *)hash, 64 ); #else sph_echo512_init( &ctx.echo ); sph_echo512( &ctx.echo, hash, 64 ); @@ -161,28 +154,8 @@ int scanhash_x17( struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; int thr_id = mythr->id; // thr_id arg is deprecated - uint64_t htmax[] = - { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = - { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - // we need bigendian data... 
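   // Scalar equivalent of the SSE byte swap below, as a sketch: each of
   // the 20 32-bit words in the 80-byte header is converted to big endian,
   //
   //    for ( int i = 0; i < 20; i++ )
   //       endiandata[i] = bswap_32( pdata[i] );
   //
   // mm128_bswap_32 swaps four words per vector, so the five casti_m128i
   // statements cover the whole header.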
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); @@ -190,23 +163,14 @@ int scanhash_x17( struct work *work, uint32_t max_nonce, casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - for ( int m = 0; m < 6; m++ ) + do { - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - pdata[19] = ++n; - be32enc( &endiandata[19], n ); - x17_hash( hash64, endiandata ); - if ( !( hash64[7] & mask ) ) - if ( fulltest( hash64, ptarget ) && !opt_benchmark ) - submit_solution( work, hash64, mythr ); - } while ( n < max_nonce && !work_restart[thr_id].restart); - break; - } - } + pdata[19] = ++n; + be32enc( &endiandata[19], n ); + x17_hash( hash64, endiandata ); + if unlikely( valid_hash( hash64, ptarget ) && !opt_benchmark ) + submit_solution( work, hash64, mythr ); + } while ( n < max_nonce && !work_restart[thr_id].restart); *hashes_done = n - first_nonce + 1; pdata[19] = n; return 0; diff --git a/algo/yespower/yescrypt-r8g.c b/algo/yespower/yescrypt-r8g.c index 5b9e2be..c080763 100644 --- a/algo/yespower/yescrypt-r8g.c +++ b/algo/yespower/yescrypt-r8g.c @@ -73,7 +73,8 @@ bool register_yescryptr8g_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower_r8g; gate->hash = (void*)&yespower_tls; - opt_target_factor = 65536.0; + opt_sapling = true; + opt_target_factor = 65536.0; return true; }; diff --git a/configure b/configure index 3dce50f..3f7e8f2 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.7. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.8. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.11.7' -PACKAGE_STRING='cpuminer-opt 3.11.7' +PACKAGE_VERSION='3.11.8' +PACKAGE_STRING='cpuminer-opt 3.11.8' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.11.7 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.11.8 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.11.7:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.11.8:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.11.7 +cpuminer-opt configure 3.11.8 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.11.7, which was +It was created by cpuminer-opt $as_me 3.11.8, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. 
PACKAGE='cpuminer-opt' - VERSION='3.11.7' + VERSION='3.11.8' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.11.7, which was +This file was extended by cpuminer-opt $as_me 3.11.8, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.11.7 +cpuminer-opt config.status 3.11.8 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 2bf3d8e..5d8771d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.11.7]) +AC_INIT([cpuminer-opt], [3.11.8]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index b6edca2..7e1f094 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -110,6 +110,7 @@ int opt_param_r = 0; int opt_pluck_n = 128; int opt_n_threads = 0; bool opt_reset_on_stale = false; +bool opt_sapling = false; // Windows doesn't support 128 bit affinity mask. // Need compile time and run time test. @@ -551,10 +552,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) goto out; } version = (uint32_t) json_integer_value( tmp ); - if ( version == 5 ) + + // yescryptr8g uses block version 5 and sapling. + if ( opt_sapling ) work->sapling = true; - else if ( version > 4 ) -// if ( (version & 0xffU) > BLOCK_VERSION_CURRENT ) + if ( (version & 0xffU) > BLOCK_VERSION_CURRENT ) { if ( version_reduce ) version = ( version & ~0xffU ) | BLOCK_VERSION_CURRENT; @@ -1057,7 +1059,7 @@ static int share_result( int result, struct work *null_work, } // calculate latency and share time. - if ( my_stats.submit_time.tv_sec ) + if likely( my_stats.submit_time.tv_sec ) { gettimeofday( &ack_time, NULL ); timeval_subtract( &latency_tv, &ack_time, &my_stats.submit_time ); @@ -1075,7 +1077,8 @@ static int share_result( int result, struct work *null_work, if ( likely( result ) ) { accepted_share_count++; - if ( ( my_stats.net_diff > 0. ) && ( my_stats.share_diff >= net_diff ) ) + if unlikely( ( my_stats.net_diff > 0. 
)
+                && ( my_stats.share_diff >= net_diff ) )
      {
         solved = true;
         solved_block_count++;
@@ -1106,17 +1109,14 @@ static int share_result( int result, struct work *null_work,
   }
   else
   {
-      if ( stale )
-         stale_sum++;
-      else
-         reject_sum++;
+      if ( stale ) stale_sum++;
+      else         reject_sum++;
   }
   submit_sum++;
   latency_sum += latency;
   pthread_mutex_unlock( &stats_lock );

-   bcol = acol = scol = rcol = "\0";
   if ( likely( result ) )
   {
      if ( unlikely( solved ) )
@@ -1148,25 +1148,19 @@ static int share_result( int result, struct work *null_work,
      }
   }

-   bcol = acol = scol = rcol = CL_WHT;
-
   if ( use_colors )
   {
+      bcol = acol = scol = rcol = CL_WHT;
      if ( likely( result ) )
      {
-         if ( unlikely( solved ) )
-         {
-            bcol = CL_MAG;
-            acol = CL_GRN;
-         }
-         else
-            acol = CL_GRN;
+         acol = CL_GRN;
+         if ( unlikely( solved ) ) bcol = CL_MAG;
      }
-      else if ( stale )
-         scol = CL_YL2;
-      else
-         rcol = CL_RED;
+      else if ( stale ) scol = CL_YL2;
+      else rcol = CL_RED;
   }
+   else
+      bcol = acol = scol = rcol = "\0";

   applog( LOG_NOTICE, "%d %s%s %s%s %s%s %s%s" CL_WHT ", %.3f sec (%dms)",
           my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
@@ -1180,31 +1174,29 @@ static int share_result( int result, struct work *null_work,
   if ( unlikely( reason && !result ) )
   {
      if ( !( opt_quiet || stale ) )
+      {
         applog( LOG_WARNING, "Reject reason: %s", reason );
-      if ( opt_debug )
-      {
         uint32_t str1[8], str2[8];
         char str3[65];

         // display share hash and target for troubleshooting
-         diff_to_target( (uint64_t*)str1, my_stats.share_diff );
+         diff_to_target( str1, my_stats.share_diff );
         for ( int i = 0; i < 8; i++ )
            be32enc( str2 + i, str1[7 - i] );
         bin2hex( str3, (unsigned char*)str2, 12 );
-         applog2( LOG_INFO, "Hash: %s...", str3 );
+         applog2( LOG_INFO, "Share diff: %g, Hash: %s...", my_stats.share_diff, str3 );

-         diff_to_target( (uint64_t*)str1, my_stats.target_diff );
+         diff_to_target( str1, my_stats.target_diff );
         for ( int i = 0; i < 8; i++ )
            be32enc( str2 + i, str1[7 - i] );
         bin2hex( str3, (unsigned char*)str2, 12 );
-         applog2( LOG_INFO, "Target: %s...", str3 );
+         applog2( LOG_INFO, "Target diff: %g, Targ: %s...", my_stats.target_diff, str3 );
      }
      if ( unlikely( opt_reset_on_stale && stale ) )
         stratum_need_reset = true;
   }
-
   return 1;
}

@@ -1265,7 +1257,7 @@ bool std_le_submit_getwork_result( CURL *curl, struct work *work )
   for ( int i = 0; i < data_size / sizeof(uint32_t); i++ )
      le32enc( &work->data[i], work->data[i] );
   gw_str = abin2hex( (uchar*)work->data, data_size );
-   if ( unlikely(!gw_str) )
+   if ( unlikely( !gw_str ) )
   {
      applog(LOG_ERR, "submit_upstream_work OOM");
      return false;
@@ -1299,7 +1291,7 @@ bool std_be_submit_getwork_result( CURL *curl, struct work *work )
   for ( int i = 0; i < data_size / sizeof(uint32_t); i++ )
      be32enc( &work->data[i], work->data[i] );
   gw_str = abin2hex( (uchar*)work->data, data_size );
-   if ( unlikely(!gw_str) )
+   if ( unlikely( !gw_str ) )
   {
      applog(LOG_ERR, "submit_upstream_work OOM");
      return false;
@@ -1755,7 +1747,7 @@ static bool get_work(struct thr_info *thr, struct work *work)
   struct workio_cmd *wc;
   struct work *work_heap;

-   if (opt_benchmark)
+   if unlikely( opt_benchmark )
   {
      uint32_t ts = (uint32_t) time(NULL);

@@ -2020,8 +2012,8 @@ void std_get_new_work( struct work* work, struct work* g_work, int thr_id,
      work_free( work );
      work_copy( work, g_work );
      *nonceptr = 0xffffffffU / opt_n_threads * thr_id;
-      if ( opt_randomize )
-         *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
+//    if ( opt_randomize )
+//       *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
      *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
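      // Worked example of the nonce-range partitioning above: thread t
      // of N scans [ 0xffffffffU / N * t, 0xffffffffU / N * (t+1) - 0x20 ),
      // so with 4 threads, thread 1 starts at 0x3fffffff and stops at
      // 0x7fffffde; the 0x20 gap leaves headroom below the next thread's
      // range.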
   }
   else
@@ -2214,7 +2206,7 @@ static void *miner_thread( void *userdata )
               continue;
            }
            // adjust max_nonce to meet target scan time
-            if (have_stratum)
+            if ( have_stratum )
               max64 = LP_SCANTIME;
            else
               max64 = g_work_time + ( have_longpoll ? LP_SCANTIME : opt_scantime )
@@ -2294,7 +2286,7 @@ static void *miner_thread( void *userdata )
         // prevent stale work in solo
         // we can't submit a block twice!
-         if ( !have_stratum && !have_longpoll )
+         if unlikely( !have_stratum && !have_longpoll )
         {
            pthread_mutex_lock( &g_work_lock );
            // will force getwork
@@ -2598,20 +2590,17 @@ void std_build_block_header( struct work* g_work, uint32_t version,
   memset( g_work->data, 0, sizeof(g_work->data) );
   g_work->data[0] = version;
-   g_work->sapling = be32dec( &version ) == 5 ? true : false;
+   g_work->sapling = opt_sapling;

-   if ( have_stratum )
-      for ( i = 0; i < 8; i++ )
+   if ( have_stratum ) for ( i = 0; i < 8; i++ )
      g_work->data[ 1+i ] = le32dec( prevhash + i );
-   else
-      for (i = 0; i < 8; i++)
+   else for (i = 0; i < 8; i++)
      g_work->data[ 8-i ] = le32dec( prevhash + i );
-
   for ( i = 0; i < 8; i++ )
      g_work->data[ 9+i ] = be32dec( merkle_tree + i );
-
   g_work->data[ algo_gate.ntime_index ] = ntime;
   g_work->data[ algo_gate.nbits_index ] = nbits;
+
   if ( g_work->sapling )
   {
      if ( have_stratum )
@@ -2653,7 +2642,6 @@ void std_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
{
   pthread_mutex_lock( &sctx->work_lock );
-
   free( g_work->job_id );
   g_work->job_id = strdup( sctx->job.job_id );
   g_work->xnonce2_len = sctx->xnonce2_size;
@@ -2690,7 +2678,7 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
      else if ( last_block_height != sctx->block_height )
         applog( LOG_BLUE, "New block %d, job %s", sctx->block_height,
                 g_work->job_id );
-      else
+      else if ( g_work->job_id )
         applog( LOG_BLUE, "New job %s", g_work->job_id );

      // Update data and calculate new estimates.
@@ -2710,6 +2698,7 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
      applog2( LOG_INFO, "%s: %s", algo_names[opt_algo], short_url );
      applog2( LOG_INFO, "Diff: Net %.3g, Stratum %.3g, Target %.3g",
               net_diff, stratum_diff, last_targetdiff );
+
      if ( likely( hr > 0. ) )
      {
         char hr_units[4] = {0};
@@ -2719,26 +2708,25 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
         sprintf_et( block_ttf, net_diff * diff_to_hash / hr );
         sprintf_et( share_ttf, last_targetdiff * diff_to_hash / hr );
         scale_hash_for_display ( &hr, hr_units );
-
         applog2( LOG_INFO, "TTF @ %.2f %sh/s: block %s, share %s",
                  hr, hr_units, block_ttf, share_ttf );
-         if ( !multipool && net_diff > 0. )
+
+         if ( !multipool && last_block_height > session_first_block )
         {
            struct timeval now, et;
            gettimeofday( &now, NULL );
            timeval_subtract( &et, &now, &session_start );
-            double net_hr = net_diff * diff_to_hash;
-            char net_ttf[32];
+            uint64_t net_ttf =
+               ( last_block_height - session_first_block ) == 0 ? 0
+               : et.tv_sec / ( last_block_height - session_first_block );
+            double net_hr = net_diff * diff_to_hash / net_ttf;
+            char net_ttf_str[32];
            char net_hr_units[4] = {0};
-            sprintf_et( net_ttf,
-               ( last_block_height - session_first_block ) == 0 ? 
0 : - et.tv_sec / ( last_block_height - session_first_block ) ); - + sprintf_et( net_ttf_str, net_ttf ); scale_hash_for_display ( &net_hr, net_hr_units ); - - applog2( LOG_INFO, "TTF @ %.2f %sh/s: %s", - net_hr, net_hr_units, net_ttf ); + applog2( LOG_INFO, "Net TTF @ %.2f %sh/s: %s", + net_hr, net_hr_units, net_ttf_str ); } } // hr > 0 } // !quiet diff --git a/miner.h b/miner.h index dbb3006..9629ee3 100644 --- a/miner.h +++ b/miner.h @@ -317,7 +317,7 @@ bool valid_hash( const void*, const void* ); void work_set_target( struct work* work, double diff ); double target_to_diff( uint32_t* target ); -extern void diff_to_target( uint64_t *target, double diff ); +extern void diff_to_target( uint32_t *target, double diff ); double hash_target_ratio( uint32_t* hash, uint32_t* target ); void work_set_target_ratio( struct work* work, const void *hash ); @@ -779,7 +779,7 @@ extern pthread_mutex_t rpc2_job_lock; extern pthread_mutex_t rpc2_login_lock; extern pthread_mutex_t applog_lock; extern pthread_mutex_t stats_lock; - +extern bool opt_sapling; static char const usage[] = "\ Usage: " PACKAGE_NAME " [OPTIONS]\n\ diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 4ad8df4..0ca4f95 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -567,6 +567,20 @@ static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1, } } +#if defined(__SSSE3__) + +static inline void mm128_bswap32_80( void *d, void *s ) +{ + __m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); + casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf ); + casti_m128i( d, 1 ) = _mm_shuffle_epi8( casti_m128i( s, 1 ), bswap_shuf ); + casti_m128i( d, 2 ) = _mm_shuffle_epi8( casti_m128i( s, 2 ), bswap_shuf ); + casti_m128i( d, 3 ) = _mm_shuffle_epi8( casti_m128i( s, 3 ), bswap_shuf ); + casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf ); +} + +#endif + static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) { __m128i s0 = casti_m128i( src,0 ); @@ -2106,6 +2120,7 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src, RLEAVE_4x64_4x32( 48 ); RLEAVE_4x64_4x32( 56 ); if ( bit_len <= 512 ) return; RLEAVE_4x64_4x32( 64 ); RLEAVE_4x64_4x32( 72 ); + if ( bit_len <= 640 ) return; RLEAVE_4x64_4x32( 80 ); RLEAVE_4x64_4x32( 88 ); RLEAVE_4x64_4x32( 96 ); RLEAVE_4x64_4x32( 104 ); RLEAVE_4x64_4x32( 112 ); RLEAVE_4x64_4x32( 120 ); @@ -2140,6 +2155,9 @@ static inline void rintrlv_8x64_8x32( void *dst, const void *src, if ( bit_len <= 512 ) return; RLEAVE_8x64_8x32( 128 ); RLEAVE_8x64_8x32( 144 ); + + if ( bit_len <= 640 ) return; + RLEAVE_8x64_8x32( 160 ); RLEAVE_8x64_8x32( 176 ); RLEAVE_8x64_8x32( 192 ); RLEAVE_8x64_8x32( 208 ); RLEAVE_8x64_8x32( 224 ); RLEAVE_8x64_8x32( 240 ); @@ -2255,6 +2273,8 @@ static inline void rintrlv_8x32_8x64( void *dst, d[38] = _mm_unpacklo_epi32( s[37], s[39] ); d[39] = _mm_unpackhi_epi32( s[37], s[39] ); + if ( bit_len <= 640 ) return; + d[40] = _mm_unpacklo_epi32( s[40], s[42] ); d[41] = _mm_unpackhi_epi32( s[40], s[42] ); d[42] = _mm_unpacklo_epi32( s[41], s[43] ); @@ -2319,7 +2339,9 @@ static inline void rintrlv_8x32_4x128( void *dst0, void *dst1, if ( bit_len <= 256 ) return; RLEAVE_8X32_4X128( 32 ); RLEAVE_8X32_4X128( 48 ); if ( bit_len <= 512 ) return; - RLEAVE_8X32_4X128( 64 ); RLEAVE_8X32_4X128( 80 ); + RLEAVE_8X32_4X128( 64 ); + if ( bit_len <= 640 ) return; + RLEAVE_8X32_4X128( 80 ); RLEAVE_8X32_4X128( 96 ); RLEAVE_8X32_4X128( 112 ); } #undef RLEAVE_8X32_4X128 @@ -2383,6 +2405,7 @@ static inline 
void rintrlv_2x128_4x64( void *dst, const void *src0, d[17] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] ); d[18] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] ); d[19] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] ); + if ( bit_len <= 640 ) return; d[20] = _mm_unpacklo_epi64( s0[10], s0[11] ); d[21] = _mm_unpacklo_epi64( s1[10], s1[11] ); d[22] = _mm_unpackhi_epi64( s0[10], s0[11] ); @@ -2453,6 +2476,7 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1, d0[ 9] = _mm_unpackhi_epi64( s[16], s[18] ); d1[ 8] = _mm_unpacklo_epi64( s[17], s[19] ); d1[ 9] = _mm_unpackhi_epi64( s[17], s[19] ); + if ( bit_len <= 640 ) return; d0[10] = _mm_unpacklo_epi64( s[20], s[22] ); d0[11] = _mm_unpackhi_epi64( s[20], s[22] ); d1[10] = _mm_unpacklo_epi64( s[21], s[23] ); @@ -2549,6 +2573,8 @@ static inline void rintrlv_4x128_8x64( void *dst, const void *src0, d[38] = _mm_unpackhi_epi64( s1[16], s1[17] ); d[39] = _mm_unpackhi_epi64( s1[18], s1[19] ); + if ( bit_len <= 640 ) return; + d[40] = _mm_unpacklo_epi64( s0[20], s0[21] ); d[41] = _mm_unpacklo_epi64( s0[22], s0[23] ); d[42] = _mm_unpacklo_epi64( s1[20], s1[21] ); @@ -2635,6 +2661,8 @@ static inline void rintrlv_8x64_4x128( void *dst0, void *dst1, d1[18] = _mm_unpacklo_epi64( s[35], s[39] ); d1[19] = _mm_unpackhi_epi64( s[35], s[39] ); + if ( bit_len <= 640 ) return; + d0[20] = _mm_unpacklo_epi64( s[40], s[44] ); d0[21] = _mm_unpackhi_epi64( s[40], s[44] ); d1[20] = _mm_unpacklo_epi64( s[42], s[46] ); @@ -2723,6 +2751,8 @@ static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2, d2[ 9] = _mm_unpacklo_epi64( s[35], s[39] ); d3[ 9] = _mm_unpackhi_epi64( s[35], s[39] ); + if ( bit_len <= 640 ) return; + d0[10] = _mm_unpacklo_epi64( s[40], s[44] ); d1[10] = _mm_unpackhi_epi64( s[40], s[44] ); d2[10] = _mm_unpacklo_epi64( s[41], s[45] ); @@ -2811,6 +2841,8 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0, d[38] = _mm_unpackhi_epi64( s2[8], s2[10] ); d[39] = _mm_unpackhi_epi64( s3[8], s3[10] ); + if ( bit_len <= 640 ) return; + d[40] = _mm_unpacklo_epi64( s0[9], s0[11] ); d[41] = _mm_unpacklo_epi64( s1[9], s1[11] ); d[42] = _mm_unpacklo_epi64( s2[9], s2[11] ); diff --git a/util.c b/util.c index d701389..635be39 100644 --- a/util.c +++ b/util.c @@ -1038,7 +1038,7 @@ bool fulltest( const uint32_t *hash, const uint32_t *target ) return rc; } -void diff_to_target(uint64_t *target, double diff) +void diff_to_target(uint32_t *target, double diff) { uint64_t m; int k; @@ -1055,7 +1055,7 @@ void diff_to_target(uint64_t *target, double diff) else { memset( target, 0, 32 ); - target[k] = m; + ((uint64_t*)target)[k] = m; // target[k] = (uint32_t)m; // target[k + 1] = (uint32_t)(m >> 32); } @@ -1064,7 +1064,7 @@ void diff_to_target(uint64_t *target, double diff) // Only used by stratum pools void work_set_target(struct work* work, double diff) { - diff_to_target( (uint64_t*)work->target, diff ); + diff_to_target( work->target, diff ); work->targetdiff = diff; } @@ -1574,8 +1574,9 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p goto out; if (!socket_full(sctx->sock, 3)) { - if (opt_debug) - applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); + applog(LOG_WARNING, "stratum extranonce subscribe timed out"); +// if (opt_debug) +// applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); goto out; } @@ -1590,7 +1591,7 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p if (!stratum_handle_method(sctx, sret)) applog(LOG_WARNING, "Stratum answer id is not correct!"); } -// 
res_val = json_object_get(extra, "result"); + res_val = json_object_get(extra, "result"); // if (opt_debug && (!res_val || json_is_false(res_val))) // applog(LOG_DEBUG, "extranonce subscribe not supported"); json_decref(extra); @@ -1898,13 +1899,13 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) } hex2bin( sctx->job.version, version, 4 ); - int ver = be32dec( sctx->job.version ); - if ( ver == 5 ) + + if ( opt_sapling ) { finalsaplinghash = json_string_value( json_array_get( params, 9 ) ); if ( !finalsaplinghash || strlen(finalsaplinghash) != 64 ) { - applog( LOG_ERR, "Stratum notify: invalid version 5 parameters" ); + applog( LOG_ERR, "Stratum notify: invalid sapling parameters" ); goto out; } } @@ -1957,7 +1958,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) hex2bin( sctx->job.prevhash, prevhash, 32 ); if ( has_claim ) hex2bin( sctx->job.extra, extradata, 32 ); if ( has_roots ) hex2bin( sctx->job.extra, extradata, 64 ); - if ( ver == 5 ) + if ( opt_sapling ) hex2bin( sctx->job.final_sapling_hash, finalsaplinghash, 32 ); if ( is_veil )