/*
 * file : grostl_bitsliced_mm.c
 * version : 1.0.208
 * date : 14.12.2010
 *
 * - multi-stream bitsliced implementation of the Grostl hash function
 * - implements the NIST hash API
 * - assumes that the message length is a multiple of 8 bits
 * - _GROSTL_BITSLICED_MM_ must be defined if compiling with ../main.c
 *
 * Cagdas Calik
 * ccalik@metu.edu.tr
 * Institute of Applied Mathematics, Middle East Technical University, Turkey.
 *
 */

#include "grss_api.h"
#include "bitsliceaes.h"

MYALIGN const unsigned int _transpose1[] = {0x060e070f, 0x040c050d, 0x020a030b, 0x00080109};
MYALIGN const unsigned int _hiqmask[] = {0x00000000, 0x00000000, 0xffffffff, 0xffffffff};
MYALIGN const unsigned int _loqmask[] = {0xffffffff, 0xffffffff, 0x00000000, 0x00000000};
MYALIGN const unsigned int _invmask[] = {0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203};

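/*
 * Layout note (added): four input streams are hashed in parallel.
 * TRANSPOSE/TRANSPOSE_BACK reorder each stream's 4 x 128-bit state so that
 * the BITSLICE macro from bitsliceaes.h can spread the eight bit planes of
 * every state byte across eight registers (exact byte order inferred from
 * the shuffle/unpack sequence below).
 */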
#define TRANSPOSE(m, u, v)\
	u[0] = _mm_shuffle_epi8(m[0], M128(_transpose1));\
	u[1] = _mm_shuffle_epi8(m[1], M128(_transpose1));\
	u[2] = _mm_shuffle_epi8(m[2], M128(_transpose1));\
	u[3] = _mm_shuffle_epi8(m[3], M128(_transpose1));\
	v[0] = _mm_unpacklo_epi16(u[3], u[2]);\
	v[1] = _mm_unpacklo_epi16(u[1], u[0]);\
	v[2] = _mm_unpackhi_epi16(u[3], u[2]);\
	v[3] = _mm_unpackhi_epi16(u[1], u[0]);\
	m[0] = _mm_unpackhi_epi32(v[2], v[3]);\
	m[1] = _mm_unpacklo_epi32(v[2], v[3]);\
	m[2] = _mm_unpackhi_epi32(v[0], v[1]);\
	m[3] = _mm_unpacklo_epi32(v[0], v[1])

#define TRANSPOSE_BACK(m, u, v)\
	u[0] = _mm_shuffle_epi8(m[0], M128(_transpose1));\
	u[1] = _mm_shuffle_epi8(m[1], M128(_transpose1));\
	u[2] = _mm_shuffle_epi8(m[2], M128(_transpose1));\
	u[3] = _mm_shuffle_epi8(m[3], M128(_transpose1));\
	v[0] = _mm_unpacklo_epi16(u[0], u[1]);\
	v[1] = _mm_unpacklo_epi16(u[2], u[3]);\
	v[2] = _mm_unpackhi_epi16(u[0], u[1]);\
	v[3] = _mm_unpackhi_epi16(u[2], u[3]);\
	m[0] = _mm_unpacklo_epi32(v[0], v[1]);\
	m[1] = _mm_unpackhi_epi32(v[0], v[1]);\
	m[2] = _mm_unpacklo_epi32(v[2], v[3]);\
	m[3] = _mm_unpackhi_epi32(v[2], v[3])

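/*
 * Init256/Init512 (added note): besides setting the IV (the output length
 * encoded into the last state word), the P and Q round constants are
 * precomputed and pushed through BITSLICE once here, so the compression
 * loop can xor them in directly in bitsliced form.
 */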
void Init256(grssState *pctx)
{
	unsigned int i;
	__m128i t;

	pctx->state1[0] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state1[1] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state1[2] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state1[3] = _mm_set_epi32(0x00010000, 0, 0, 0);

	pctx->state2[0] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state2[1] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state2[2] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state2[3] = _mm_set_epi32(0x00010000, 0, 0, 0);

	pctx->state3[0] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state3[1] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state3[2] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state3[3] = _mm_set_epi32(0x00010000, 0, 0, 0);

	pctx->state4[0] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state4[1] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state4[2] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state4[3] = _mm_set_epi32(0x00010000, 0, 0, 0);

	for(i = 0; i < 10; i++)
	{
		pctx->_Pconst[i][0] = _mm_set_epi32(i << 24, 0, 0, 0);
		pctx->_Pconst[i][1] = _mm_set_epi32(i << 24, 0, 0, 0);
		pctx->_Pconst[i][2] = _mm_set_epi32(i << 24, 0, 0, 0);
		pctx->_Pconst[i][3] = _mm_set_epi32(i << 24, 0, 0, 0);
		pctx->_Pconst[i][4] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Pconst[i][5] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Pconst[i][6] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Pconst[i][7] = _mm_set_epi32(0, 0, 0, 0);

		pctx->_Qconst[i][0] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Qconst[i][1] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Qconst[i][2] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Qconst[i][3] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Qconst[i][4] = _mm_set_epi32(0, 0, (~i) << 24, 0);
		pctx->_Qconst[i][5] = _mm_set_epi32(0, 0, (~i) << 24, 0);
		pctx->_Qconst[i][6] = _mm_set_epi32(0, 0, (~i) << 24, 0);
		pctx->_Qconst[i][7] = _mm_set_epi32(0, 0, (~i) << 24, 0);

		BITSLICE(pctx->_Pconst[i][0], pctx->_Pconst[i][1], pctx->_Pconst[i][2], pctx->_Pconst[i][3], pctx->_Pconst[i][4], pctx->_Pconst[i][5], pctx->_Pconst[i][6], pctx->_Pconst[i][7], t);
		BITSLICE(pctx->_Qconst[i][0], pctx->_Qconst[i][1], pctx->_Qconst[i][2], pctx->_Qconst[i][3], pctx->_Qconst[i][4], pctx->_Qconst[i][5], pctx->_Qconst[i][6], pctx->_Qconst[i][7], t);
	}

	pctx->_shiftconst[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x06050403, 0x02010007);
	pctx->_shiftconst[1] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x04030201, 0x00070605);
	pctx->_shiftconst[2] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x02010007, 0x06050403);
	pctx->_shiftconst[3] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x00070605, 0x04030201);
}

void Init512(grssState *pctx)
{
	unsigned int i;
	__m128i t;

	pctx->state1[0] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state1[1] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state1[2] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state1[3] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state1[4] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state1[5] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state1[6] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state1[7] = _mm_set_epi32(0x00020000, 0, 0, 0);

	pctx->state2[0] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state2[1] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state2[2] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state2[3] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state2[4] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state2[5] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state2[6] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state2[7] = _mm_set_epi32(0x00020000, 0, 0, 0);

	pctx->state3[0] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state3[1] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state3[2] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state3[3] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state3[4] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state3[5] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state3[6] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state3[7] = _mm_set_epi32(0x00020000, 0, 0, 0);

	pctx->state4[0] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state4[1] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state4[2] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state4[3] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state4[4] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state4[5] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state4[6] = _mm_set_epi32(0, 0, 0, 0);
	pctx->state4[7] = _mm_set_epi32(0x00020000, 0, 0, 0);

	for(i = 0; i < 14; i++)
	{
		pctx->_Pconst[i][0] = _mm_set_epi32(i << 24, 0, 0, 0);
		pctx->_Pconst[i][1] = _mm_set_epi32(i << 24, 0, 0, 0);
		pctx->_Pconst[i][2] = _mm_set_epi32(i << 24, 0, 0, 0);
		pctx->_Pconst[i][3] = _mm_set_epi32(i << 24, 0, 0, 0);
		pctx->_Pconst[i][4] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Pconst[i][5] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Pconst[i][6] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Pconst[i][7] = _mm_set_epi32(0, 0, 0, 0);

		pctx->_Qconst[i][4] = _mm_set_epi32((~i) << 24, 0, 0, 0);
		pctx->_Qconst[i][5] = _mm_set_epi32((~i) << 24, 0, 0, 0);
		pctx->_Qconst[i][6] = _mm_set_epi32((~i) << 24, 0, 0, 0);
		pctx->_Qconst[i][7] = _mm_set_epi32((~i) << 24, 0, 0, 0);
		pctx->_Qconst[i][0] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Qconst[i][1] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Qconst[i][2] = _mm_set_epi32(0, 0, 0, 0);
		pctx->_Qconst[i][3] = _mm_set_epi32(0, 0, 0, 0);

		BITSLICE(pctx->_Pconst[i][0], pctx->_Pconst[i][1], pctx->_Pconst[i][2], pctx->_Pconst[i][3], pctx->_Pconst[i][4], pctx->_Pconst[i][5], pctx->_Pconst[i][6], pctx->_Pconst[i][7], t);
		BITSLICE(pctx->_Qconst[i][0], pctx->_Qconst[i][1], pctx->_Qconst[i][2], pctx->_Qconst[i][3], pctx->_Qconst[i][4], pctx->_Qconst[i][5], pctx->_Qconst[i][6], pctx->_Qconst[i][7], t);
	}

	pctx->_shiftconst[1] = _mm_set_epi32(0x0e0d0c0b, 0x0a090807, 0x06050403, 0x0201000f);
	pctx->_shiftconst[2] = _mm_set_epi32(0x0d0c0b0a, 0x09080706, 0x05040302, 0x01000f0e);
	pctx->_shiftconst[3] = _mm_set_epi32(0x0c0b0a09, 0x08070605, 0x04030201, 0x000f0e0d);
	pctx->_shiftconst[4] = _mm_set_epi32(0x0b0a0908, 0x07060504, 0x03020100, 0x0f0e0d0c);
	pctx->_shiftconst[5] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b);
	pctx->_shiftconst[6] = _mm_set_epi32(0x09080706, 0x05040302, 0x01000f0e, 0x0d0c0b0a);
	pctx->_shiftconst[7] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605);
}

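/*
 * MixBytes multipliers (added note): MUL_BITSLICE_N accumulates N * b in
 * GF(2^8) with the AES polynomial x^8 + x^4 + x^3 + x + 1, one bit plane
 * per __m128i (b7 is the most significant plane).  Doubling maps bits
 * (b7..b0) to (b6, b5, b4, b3^b7, b2^b7, b1, b0^b7, b7), which is exactly
 * the xor pattern of MUL_BITSLICE_2; the others follow from 3 = 2 ^ 1,
 * 4 = 2 * 2, 5 = 4 ^ 1 and 7 = 4 ^ 2 ^ 1.  A plain byte-at-a-time sketch
 * of the same arithmetic, for reference only:
 */
#if 0
static unsigned char gmul2(unsigned char b)
{
	/* shift left, reduce by the AES polynomial (0x1b) on overflow */
	return (unsigned char)((b << 1) ^ ((b & 0x80) ? 0x1b : 0x00));
}

static unsigned char gmulN(unsigned char b, unsigned char n)
{
	/* n is one of the MixBytes coefficients 2, 3, 4, 5, 7 */
	unsigned char b2 = gmul2(b), b4 = gmul2(b2);
	switch(n)
	{
		case 2: return b2;
		case 3: return (unsigned char)(b2 ^ b);
		case 4: return b4;
		case 5: return (unsigned char)(b4 ^ b);
		case 7: return (unsigned char)(b4 ^ b2 ^ b);
	}
	return b;
}
#endif
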
#define MUL_BITSLICE_2(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\
	x[7] = _mm_xor_si128(x[7], b6[i]);\
	x[6] = _mm_xor_si128(x[6], b5[i]);\
	x[5] = _mm_xor_si128(x[5], b4[i]);\
	x[4] = _mm_xor_si128(x[4], b3[i]);\
	x[4] = _mm_xor_si128(x[4], b7[i]);\
	x[3] = _mm_xor_si128(x[3], b2[i]);\
	x[3] = _mm_xor_si128(x[3], b7[i]);\
	x[2] = _mm_xor_si128(x[2], b1[i]);\
	x[1] = _mm_xor_si128(x[1], b0[i]);\
	x[1] = _mm_xor_si128(x[1], b7[i]);\
	x[0] = _mm_xor_si128(x[0], b7[i])

#define MUL_BITSLICE_3(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\
	x[7] = _mm_xor_si128(x[7], b6[i]);\
	x[7] = _mm_xor_si128(x[7], b7[i]);\
	x[6] = _mm_xor_si128(x[6], b5[i]);\
	x[6] = _mm_xor_si128(x[6], b6[i]);\
	x[5] = _mm_xor_si128(x[5], b4[i]);\
	x[5] = _mm_xor_si128(x[5], b5[i]);\
	x[4] = _mm_xor_si128(x[4], b3[i]);\
	x[4] = _mm_xor_si128(x[4], b4[i]);\
	x[4] = _mm_xor_si128(x[4], b7[i]);\
	x[3] = _mm_xor_si128(x[3], b2[i]);\
	x[3] = _mm_xor_si128(x[3], b3[i]);\
	x[3] = _mm_xor_si128(x[3], b7[i]);\
	x[2] = _mm_xor_si128(x[2], b1[i]);\
	x[2] = _mm_xor_si128(x[2], b2[i]);\
	x[1] = _mm_xor_si128(x[1], b0[i]);\
	x[1] = _mm_xor_si128(x[1], b1[i]);\
	x[1] = _mm_xor_si128(x[1], b7[i]);\
	x[0] = _mm_xor_si128(x[0], b0[i]);\
	x[0] = _mm_xor_si128(x[0], b7[i])

#define MUL_BITSLICE_4(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\
	x[7] = _mm_xor_si128(x[7], b5[i]);\
	x[6] = _mm_xor_si128(x[6], b4[i]);\
	x[5] = _mm_xor_si128(x[5], b3[i]);\
	x[5] = _mm_xor_si128(x[5], b7[i]);\
	x[4] = _mm_xor_si128(x[4], b2[i]);\
	x[4] = _mm_xor_si128(x[4], b6[i]);\
	x[4] = _mm_xor_si128(x[4], b7[i]);\
	x[3] = _mm_xor_si128(x[3], b1[i]);\
	x[3] = _mm_xor_si128(x[3], b6[i]);\
	x[2] = _mm_xor_si128(x[2], b0[i]);\
	x[2] = _mm_xor_si128(x[2], b7[i]);\
	x[1] = _mm_xor_si128(x[1], b6[i]);\
	x[1] = _mm_xor_si128(x[1], b7[i]);\
	x[0] = _mm_xor_si128(x[0], b6[i])

#define MUL_BITSLICE_5(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\
	x[7] = _mm_xor_si128(x[7], b5[i]);\
	x[7] = _mm_xor_si128(x[7], b7[i]);\
	x[6] = _mm_xor_si128(x[6], b4[i]);\
	x[6] = _mm_xor_si128(x[6], b6[i]);\
	x[5] = _mm_xor_si128(x[5], b3[i]);\
	x[5] = _mm_xor_si128(x[5], b5[i]);\
	x[5] = _mm_xor_si128(x[5], b7[i]);\
	x[4] = _mm_xor_si128(x[4], b2[i]);\
	x[4] = _mm_xor_si128(x[4], b4[i]);\
	x[4] = _mm_xor_si128(x[4], b6[i]);\
	x[4] = _mm_xor_si128(x[4], b7[i]);\
	x[3] = _mm_xor_si128(x[3], b1[i]);\
	x[3] = _mm_xor_si128(x[3], b3[i]);\
	x[3] = _mm_xor_si128(x[3], b6[i]);\
	x[2] = _mm_xor_si128(x[2], b0[i]);\
	x[2] = _mm_xor_si128(x[2], b2[i]);\
	x[2] = _mm_xor_si128(x[2], b7[i]);\
	x[1] = _mm_xor_si128(x[1], b1[i]);\
	x[1] = _mm_xor_si128(x[1], b6[i]);\
	x[1] = _mm_xor_si128(x[1], b7[i]);\
	x[0] = _mm_xor_si128(x[0], b0[i]);\
	x[0] = _mm_xor_si128(x[0], b6[i])

#define MUL_BITSLICE_7(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\
	x[7] = _mm_xor_si128(x[7], b5[i]);\
	x[7] = _mm_xor_si128(x[7], b6[i]);\
	x[7] = _mm_xor_si128(x[7], b7[i]);\
	x[6] = _mm_xor_si128(x[6], b4[i]);\
	x[6] = _mm_xor_si128(x[6], b5[i]);\
	x[6] = _mm_xor_si128(x[6], b6[i]);\
	x[5] = _mm_xor_si128(x[5], b3[i]);\
	x[5] = _mm_xor_si128(x[5], b4[i]);\
	x[5] = _mm_xor_si128(x[5], b5[i]);\
	x[5] = _mm_xor_si128(x[5], b7[i]);\
	x[4] = _mm_xor_si128(x[4], b2[i]);\
	x[4] = _mm_xor_si128(x[4], b3[i]);\
	x[4] = _mm_xor_si128(x[4], b4[i]);\
	x[4] = _mm_xor_si128(x[4], b6[i]);\
	x[3] = _mm_xor_si128(x[3], b1[i]);\
	x[3] = _mm_xor_si128(x[3], b2[i]);\
	x[3] = _mm_xor_si128(x[3], b3[i]);\
	x[3] = _mm_xor_si128(x[3], b6[i]);\
	x[3] = _mm_xor_si128(x[3], b7[i]);\
	x[2] = _mm_xor_si128(x[2], b0[i]);\
	x[2] = _mm_xor_si128(x[2], b1[i]);\
	x[2] = _mm_xor_si128(x[2], b2[i]);\
	x[2] = _mm_xor_si128(x[2], b7[i]);\
	x[1] = _mm_xor_si128(x[1], b0[i]);\
	x[1] = _mm_xor_si128(x[1], b1[i]);\
	x[1] = _mm_xor_si128(x[1], b6[i]);\
	x[0] = _mm_xor_si128(x[0], b0[i]);\
	x[0] = _mm_xor_si128(x[0], b6[i]);\
	x[0] = _mm_xor_si128(x[0], b7[i])

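/*
 * 256-bit row moves (added note): in the Grostl-256 path each 128-bit
 * register holds two rows of the 8x8 byte state, one per 64-bit half.
 * ROW_L2L/L2R/R2L/R2R keep or move a half; the ROW_MOV_xy aliases pair a
 * source half with a destination half (even/odd naming inferred from the
 * way MUL_BITSLICE256_* uses them below).
 */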
#define ROW_L2L(x) _mm_and_si128(x, M128(_hiqmask))
#define ROW_L2R(x) _mm_srli_si128(x, 8)
#define ROW_R2L(x) _mm_slli_si128(x, 8)
#define ROW_R2R(x) _mm_and_si128(x, M128(_loqmask))

#define ROW_MOV_EO ROW_L2R
#define ROW_MOV_EE ROW_L2L
#define ROW_MOV_OE ROW_R2L
#define ROW_MOV_OO ROW_R2R

#define MUL_BITSLICE256_2(x, rm, i)\
	x[7] = _mm_xor_si128(x[7], rm(p2[i]));\
	x[6] = _mm_xor_si128(x[6], rm(p3[i]));\
	x[5] = _mm_xor_si128(x[5], rm(p4[i]));\
	x[4] = _mm_xor_si128(x[4], rm(q1[i]));\
	x[4] = _mm_xor_si128(x[4], rm(p1[i]));\
	x[3] = _mm_xor_si128(x[3], rm(q2[i]));\
	x[3] = _mm_xor_si128(x[3], rm(p1[i]));\
	x[2] = _mm_xor_si128(x[2], rm(q3[i]));\
	x[1] = _mm_xor_si128(x[1], rm(q4[i]));\
	x[1] = _mm_xor_si128(x[1], rm(p1[i]));\
	x[0] = _mm_xor_si128(x[0], rm(p1[i]))

#define MUL_BITSLICE256_3(x, rm, i)\
	x[7] = _mm_xor_si128(x[7], rm(p2[i]));\
	x[7] = _mm_xor_si128(x[7], rm(p1[i]));\
	x[6] = _mm_xor_si128(x[6], rm(p3[i]));\
	x[6] = _mm_xor_si128(x[6], rm(p2[i]));\
	x[5] = _mm_xor_si128(x[5], rm(p4[i]));\
	x[5] = _mm_xor_si128(x[5], rm(p3[i]));\
	x[4] = _mm_xor_si128(x[4], rm(q1[i]));\
	x[4] = _mm_xor_si128(x[4], rm(p4[i]));\
	x[4] = _mm_xor_si128(x[4], rm(p1[i]));\
	x[3] = _mm_xor_si128(x[3], rm(q2[i]));\
	x[3] = _mm_xor_si128(x[3], rm(q1[i]));\
	x[3] = _mm_xor_si128(x[3], rm(p1[i]));\
	x[2] = _mm_xor_si128(x[2], rm(q2[i]));\
	x[2] = _mm_xor_si128(x[2], rm(q3[i]));\
	x[1] = _mm_xor_si128(x[1], rm(q4[i]));\
	x[1] = _mm_xor_si128(x[1], rm(q3[i]));\
	x[1] = _mm_xor_si128(x[1], rm(p1[i]));\
	x[0] = _mm_xor_si128(x[0], rm(q4[i]));\
	x[0] = _mm_xor_si128(x[0], rm(p1[i]))

#define MUL_BITSLICE256_4(x, rm, i)\
	x[7] = _mm_xor_si128(x[7], rm(p3[i]));\
	x[6] = _mm_xor_si128(x[6], rm(p4[i]));\
	x[5] = _mm_xor_si128(x[5], rm(q1[i]));\
	x[5] = _mm_xor_si128(x[5], rm(p1[i]));\
	x[4] = _mm_xor_si128(x[4], rm(q2[i]));\
	x[4] = _mm_xor_si128(x[4], rm(p2[i]));\
	x[4] = _mm_xor_si128(x[4], rm(p1[i]));\
	x[3] = _mm_xor_si128(x[3], rm(q3[i]));\
	x[3] = _mm_xor_si128(x[3], rm(p2[i]));\
	x[2] = _mm_xor_si128(x[2], rm(q4[i]));\
	x[2] = _mm_xor_si128(x[2], rm(p1[i]));\
	x[1] = _mm_xor_si128(x[1], rm(p2[i]));\
	x[1] = _mm_xor_si128(x[1], rm(p1[i]));\
	x[0] = _mm_xor_si128(x[0], rm(p2[i]))

#define MUL_BITSLICE256_5(x, rm, i)\
	x[7] = _mm_xor_si128(x[7], rm(p3[i]));\
	x[7] = _mm_xor_si128(x[7], rm(p1[i]));\
	x[6] = _mm_xor_si128(x[6], rm(p4[i]));\
	x[6] = _mm_xor_si128(x[6], rm(p2[i]));\
	x[5] = _mm_xor_si128(x[5], rm(q1[i]));\
	x[5] = _mm_xor_si128(x[5], rm(p3[i]));\
	x[5] = _mm_xor_si128(x[5], rm(p1[i]));\
	x[4] = _mm_xor_si128(x[4], rm(q2[i]));\
	x[4] = _mm_xor_si128(x[4], rm(p4[i]));\
	x[4] = _mm_xor_si128(x[4], rm(p2[i]));\
	x[4] = _mm_xor_si128(x[4], rm(p1[i]));\
	x[3] = _mm_xor_si128(x[3], rm(q3[i]));\
	x[3] = _mm_xor_si128(x[3], rm(q1[i]));\
	x[3] = _mm_xor_si128(x[3], rm(p2[i]));\
	x[2] = _mm_xor_si128(x[2], rm(q4[i]));\
	x[2] = _mm_xor_si128(x[2], rm(q2[i]));\
	x[2] = _mm_xor_si128(x[2], rm(p1[i]));\
	x[1] = _mm_xor_si128(x[1], rm(q3[i]));\
	x[1] = _mm_xor_si128(x[1], rm(p2[i]));\
	x[1] = _mm_xor_si128(x[1], rm(p1[i]));\
	x[0] = _mm_xor_si128(x[0], rm(q4[i]));\
	x[0] = _mm_xor_si128(x[0], rm(p2[i]))

#define MUL_BITSLICE256_7(x, rm, i)\
	x[7] = _mm_xor_si128(x[7], rm(p3[i]));\
	x[7] = _mm_xor_si128(x[7], rm(p2[i]));\
	x[7] = _mm_xor_si128(x[7], rm(p1[i]));\
	x[6] = _mm_xor_si128(x[6], rm(p4[i]));\
	x[6] = _mm_xor_si128(x[6], rm(p3[i]));\
	x[6] = _mm_xor_si128(x[6], rm(p2[i]));\
	x[5] = _mm_xor_si128(x[5], rm(q1[i]));\
	x[5] = _mm_xor_si128(x[5], rm(p4[i]));\
	x[5] = _mm_xor_si128(x[5], rm(p3[i]));\
	x[5] = _mm_xor_si128(x[5], rm(p1[i]));\
	x[4] = _mm_xor_si128(x[4], rm(q2[i]));\
	x[4] = _mm_xor_si128(x[4], rm(q1[i]));\
	x[4] = _mm_xor_si128(x[4], rm(p4[i]));\
	x[4] = _mm_xor_si128(x[4], rm(p2[i]));\
	x[3] = _mm_xor_si128(x[3], rm(q3[i]));\
	x[3] = _mm_xor_si128(x[3], rm(q2[i]));\
	x[3] = _mm_xor_si128(x[3], rm(q1[i]));\
	x[3] = _mm_xor_si128(x[3], rm(p2[i]));\
	x[3] = _mm_xor_si128(x[3], rm(p1[i]));\
	x[2] = _mm_xor_si128(x[2], rm(q4[i]));\
	x[2] = _mm_xor_si128(x[2], rm(q3[i]));\
	x[2] = _mm_xor_si128(x[2], rm(q2[i]));\
	x[2] = _mm_xor_si128(x[2], rm(p1[i]));\
	x[1] = _mm_xor_si128(x[1], rm(q4[i]));\
	x[1] = _mm_xor_si128(x[1], rm(q3[i]));\
	x[1] = _mm_xor_si128(x[1], rm(p2[i]));\
	x[0] = _mm_xor_si128(x[0], rm(q4[i]));\
	x[0] = _mm_xor_si128(x[0], rm(p2[i]));\
	x[0] = _mm_xor_si128(x[0], rm(p1[i]))

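/*
 * Compress256 (added summary): absorbs uBlockCount 64-byte blocks from each
 * of the four streams.  Per round: AddRoundConstant (P and Q get different
 * constants), bitsliced SubBytes, ShiftBytes via pshufb, and MixBytes with
 * the circulant matrix circ(02, 02, 03, 04, 05, 03, 05, 07) accumulated by
 * the MUL_BITSLICE* macros above.
 */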
void Compress256(grssState *ctx,
	const unsigned char *pmsg1, const unsigned char *pmsg2, const unsigned char *pmsg3, const unsigned char *pmsg4,
	DataLength uBlockCount)
{
	DataLength b;
	unsigned int i, r;
	__m128i x[8], t0, t1, t2, t3, t4, t5, t6, t7, u[4], u2[4];
	__m128i p1[4], p2[4], p3[4], p4[4], q1[4], q2[4], q3[4], q4[4];
	__m128i r1[8], r2[8], r3[8], r4[8], s1[8], s2[8], s3[8], s4[8];
	__m128i x01[8], x23[8], x45[8], x67[8];
	__m128i x0[8], x1[8], x2[8], x3[8], x4[8], x5[8], x6[8], x7[8];

	// clear x[] (was xor-with-self on uninitialized registers; _mm_setzero_si128 is the well-defined equivalent)
	for(i = 0; i < 8; i++)
		x[i] = _mm_setzero_si128();

	// transpose cv
	TRANSPOSE(ctx->state1, u, u2);
	TRANSPOSE(ctx->state2, u, u2);
	TRANSPOSE(ctx->state3, u, u2);
	TRANSPOSE(ctx->state4, u, u2);

	for(b = 0; b < uBlockCount; b++)
	{
		q1[0] = _mm_loadu_si128((__m128i*)pmsg1 + 0);
		q1[1] = _mm_loadu_si128((__m128i*)pmsg1 + 1);
		q1[2] = _mm_loadu_si128((__m128i*)pmsg1 + 2);
		q1[3] = _mm_loadu_si128((__m128i*)pmsg1 + 3);
		q2[0] = _mm_loadu_si128((__m128i*)pmsg2 + 0);
		q2[1] = _mm_loadu_si128((__m128i*)pmsg2 + 1);
		q2[2] = _mm_loadu_si128((__m128i*)pmsg2 + 2);
		q2[3] = _mm_loadu_si128((__m128i*)pmsg2 + 3);
		q3[0] = _mm_loadu_si128((__m128i*)pmsg3 + 0);
		q3[1] = _mm_loadu_si128((__m128i*)pmsg3 + 1);
		q3[2] = _mm_loadu_si128((__m128i*)pmsg3 + 2);
		q3[3] = _mm_loadu_si128((__m128i*)pmsg3 + 3);
		q4[0] = _mm_loadu_si128((__m128i*)pmsg4 + 0);
		q4[1] = _mm_loadu_si128((__m128i*)pmsg4 + 1);
		q4[2] = _mm_loadu_si128((__m128i*)pmsg4 + 2);
		q4[3] = _mm_loadu_si128((__m128i*)pmsg4 + 3);

		// transpose message
		TRANSPOSE(q1, u, u2);
		TRANSPOSE(q2, u, u2);
		TRANSPOSE(q3, u, u2);
		TRANSPOSE(q4, u, u2);

		// xor cv and message
		for(i = 0; i < 4; i++)
		{
			p1[i] = _mm_xor_si128(ctx->state1[i], q1[i]);
			p2[i] = _mm_xor_si128(ctx->state2[i], q2[i]);
			p3[i] = _mm_xor_si128(ctx->state3[i], q3[i]);
			p4[i] = _mm_xor_si128(ctx->state4[i], q4[i]);
		}

		BITSLICE(p1[0], p2[0], p3[0], p4[0], q1[0], q2[0], q3[0], q4[0], t0);
		BITSLICE(p1[1], p2[1], p3[1], p4[1], q1[1], q2[1], q3[1], q4[1], t0);
		BITSLICE(p1[2], p2[2], p3[2], p4[2], q1[2], q2[2], q3[2], q4[2], t0);
		BITSLICE(p1[3], p2[3], p3[3], p4[3], q1[3], q2[3], q3[3], q4[3], t0);

		for(r = 0; r < 10; r++)
		{
			// Add const
			p1[0] = _mm_xor_si128(p1[0], ctx->_Pconst[r][0]);
			p2[0] = _mm_xor_si128(p2[0], ctx->_Pconst[r][1]);
			p3[0] = _mm_xor_si128(p3[0], ctx->_Pconst[r][2]);
			p4[0] = _mm_xor_si128(p4[0], ctx->_Pconst[r][3]);
			q1[0] = _mm_xor_si128(q1[0], ctx->_Pconst[r][4]);
			q2[0] = _mm_xor_si128(q2[0], ctx->_Pconst[r][5]);
			q3[0] = _mm_xor_si128(q3[0], ctx->_Pconst[r][6]);
			q4[0] = _mm_xor_si128(q4[0], ctx->_Pconst[r][7]);

			p1[3] = _mm_xor_si128(p1[3], ctx->_Qconst[r][0]);
			p2[3] = _mm_xor_si128(p2[3], ctx->_Qconst[r][1]);
			p3[3] = _mm_xor_si128(p3[3], ctx->_Qconst[r][2]);
			p4[3] = _mm_xor_si128(p4[3], ctx->_Qconst[r][3]);
			q1[3] = _mm_xor_si128(q1[3], ctx->_Qconst[r][4]);
			q2[3] = _mm_xor_si128(q2[3], ctx->_Qconst[r][5]);
			q3[3] = _mm_xor_si128(q3[3], ctx->_Qconst[r][6]);
			q4[3] = _mm_xor_si128(q4[3], ctx->_Qconst[r][7]);

			// Sub bytes
			SUBSTITUTE_BITSLICE(q4[0], q3[0], q2[0], q1[0], p4[0], p3[0], p2[0], p1[0], t0, t1, t2, t3, t4, t5, t6, t7);
			SUBSTITUTE_BITSLICE(q4[1], q3[1], q2[1], q1[1], p4[1], p3[1], p2[1], p1[1], t0, t1, t2, t3, t4, t5, t6, t7);
			SUBSTITUTE_BITSLICE(q4[2], q3[2], q2[2], q1[2], p4[2], p3[2], p2[2], p1[2], t0, t1, t2, t3, t4, t5, t6, t7);
			SUBSTITUTE_BITSLICE(q4[3], q3[3], q2[3], q1[3], p4[3], p3[3], p2[3], p1[3], t0, t1, t2, t3, t4, t5, t6, t7);

			// Shift bytes
			p1[0] = _mm_shuffle_epi8(p1[0], ctx->_shiftconst[0]);
			p2[0] = _mm_shuffle_epi8(p2[0], ctx->_shiftconst[0]);
			p3[0] = _mm_shuffle_epi8(p3[0], ctx->_shiftconst[0]);
			p4[0] = _mm_shuffle_epi8(p4[0], ctx->_shiftconst[0]);
			q1[0] = _mm_shuffle_epi8(q1[0], ctx->_shiftconst[0]);
			q2[0] = _mm_shuffle_epi8(q2[0], ctx->_shiftconst[0]);
			q3[0] = _mm_shuffle_epi8(q3[0], ctx->_shiftconst[0]);
			q4[0] = _mm_shuffle_epi8(q4[0], ctx->_shiftconst[0]);

			p1[1] = _mm_shuffle_epi8(p1[1], ctx->_shiftconst[1]);
			p2[1] = _mm_shuffle_epi8(p2[1], ctx->_shiftconst[1]);
			p3[1] = _mm_shuffle_epi8(p3[1], ctx->_shiftconst[1]);
			p4[1] = _mm_shuffle_epi8(p4[1], ctx->_shiftconst[1]);
			q1[1] = _mm_shuffle_epi8(q1[1], ctx->_shiftconst[1]);
			q2[1] = _mm_shuffle_epi8(q2[1], ctx->_shiftconst[1]);
			q3[1] = _mm_shuffle_epi8(q3[1], ctx->_shiftconst[1]);
			q4[1] = _mm_shuffle_epi8(q4[1], ctx->_shiftconst[1]);

			p1[2] = _mm_shuffle_epi8(p1[2], ctx->_shiftconst[2]);
			p2[2] = _mm_shuffle_epi8(p2[2], ctx->_shiftconst[2]);
			p3[2] = _mm_shuffle_epi8(p3[2], ctx->_shiftconst[2]);
			p4[2] = _mm_shuffle_epi8(p4[2], ctx->_shiftconst[2]);
			q1[2] = _mm_shuffle_epi8(q1[2], ctx->_shiftconst[2]);
			q2[2] = _mm_shuffle_epi8(q2[2], ctx->_shiftconst[2]);
			q3[2] = _mm_shuffle_epi8(q3[2], ctx->_shiftconst[2]);
			q4[2] = _mm_shuffle_epi8(q4[2], ctx->_shiftconst[2]);

			p1[3] = _mm_shuffle_epi8(p1[3], ctx->_shiftconst[3]);
			p2[3] = _mm_shuffle_epi8(p2[3], ctx->_shiftconst[3]);
			p3[3] = _mm_shuffle_epi8(p3[3], ctx->_shiftconst[3]);
			p4[3] = _mm_shuffle_epi8(p4[3], ctx->_shiftconst[3]);
			q1[3] = _mm_shuffle_epi8(q1[3], ctx->_shiftconst[3]);
			q2[3] = _mm_shuffle_epi8(q2[3], ctx->_shiftconst[3]);
			q3[3] = _mm_shuffle_epi8(q3[3], ctx->_shiftconst[3]);
			q4[3] = _mm_shuffle_epi8(q4[3], ctx->_shiftconst[3]);

			// Mix bytes
#if 0
			for(i = 0; i < 4; i++)
			{
				r1[2 * i + 0] = _mm_srli_si128(p1[i], 8);
				r1[2 * i + 1] = _mm_and_si128(p1[i], M128(_loqmask));
				r2[2 * i + 0] = _mm_srli_si128(p2[i], 8);
				r2[2 * i + 1] = _mm_and_si128(p2[i], M128(_loqmask));
				r3[2 * i + 0] = _mm_srli_si128(p3[i], 8);
				r3[2 * i + 1] = _mm_and_si128(p3[i], M128(_loqmask));
				r4[2 * i + 0] = _mm_srli_si128(p4[i], 8);
				r4[2 * i + 1] = _mm_and_si128(p4[i], M128(_loqmask));

				s1[2 * i + 0] = _mm_srli_si128(q1[i], 8);
				s1[2 * i + 1] = _mm_and_si128(q1[i], M128(_loqmask));
				s2[2 * i + 0] = _mm_srli_si128(q2[i], 8);
				s2[2 * i + 1] = _mm_and_si128(q2[i], M128(_loqmask));
				s3[2 * i + 0] = _mm_srli_si128(q3[i], 8);
				s3[2 * i + 1] = _mm_and_si128(q3[i], M128(_loqmask));
				s4[2 * i + 0] = _mm_srli_si128(q4[i], 8);
				s4[2 * i + 1] = _mm_and_si128(q4[i], M128(_loqmask));
			}

			// clear the accumulators (previously xor-with-self on uninitialized values)
			for(i = 0; i < 8; i++)
			{
				x0[i] = _mm_setzero_si128();
				x1[i] = _mm_setzero_si128();
				x2[i] = _mm_setzero_si128();
				x3[i] = _mm_setzero_si128();
				x4[i] = _mm_setzero_si128();
				x5[i] = _mm_setzero_si128();
				x6[i] = _mm_setzero_si128();
				x7[i] = _mm_setzero_si128();
			}

			MUL_BITSLICE_2(x0, 0, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_2(x0, 1, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x0, 2, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_4(x0, 3, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x0, 4, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x0, 5, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x0, 6, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_7(x0, 7, r1, r2, r3, r4, s1, s2, s3, s4);

			MUL_BITSLICE_2(x1, 1, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_2(x1, 2, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x1, 3, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_4(x1, 4, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x1, 5, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x1, 6, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x1, 7, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_7(x1, 0, r1, r2, r3, r4, s1, s2, s3, s4);

			MUL_BITSLICE_2(x2, 2, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_2(x2, 3, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x2, 4, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_4(x2, 5, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x2, 6, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x2, 7, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x2, 0, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_7(x2, 1, r1, r2, r3, r4, s1, s2, s3, s4);

			MUL_BITSLICE_2(x3, 3, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_2(x3, 4, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x3, 5, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_4(x3, 6, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x3, 7, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x3, 0, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x3, 1, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_7(x3, 2, r1, r2, r3, r4, s1, s2, s3, s4);

			MUL_BITSLICE_2(x4, 4, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_2(x4, 5, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x4, 6, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_4(x4, 7, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x4, 0, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x4, 1, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x4, 2, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_7(x4, 3, r1, r2, r3, r4, s1, s2, s3, s4);

			MUL_BITSLICE_2(x5, 5, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_2(x5, 6, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x5, 7, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_4(x5, 0, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x5, 1, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x5, 2, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x5, 3, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_7(x5, 4, r1, r2, r3, r4, s1, s2, s3, s4);

			MUL_BITSLICE_2(x6, 6, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_2(x6, 7, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x6, 0, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_4(x6, 1, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x6, 2, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x6, 3, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x6, 4, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_7(x6, 5, r1, r2, r3, r4, s1, s2, s3, s4);

			MUL_BITSLICE_2(x7, 7, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_2(x7, 0, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x7, 1, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_4(x7, 2, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x7, 3, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_3(x7, 4, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_5(x7, 5, r1, r2, r3, r4, s1, s2, s3, s4);
			MUL_BITSLICE_7(x7, 6, r1, r2, r3, r4, s1, s2, s3, s4);

			p1[0] = _mm_unpacklo_epi64(x1[7], x0[7]);
			p2[0] = _mm_unpacklo_epi64(x1[6], x0[6]);
			p3[0] = _mm_unpacklo_epi64(x1[5], x0[5]);
			p4[0] = _mm_unpacklo_epi64(x1[4], x0[4]);
			q1[0] = _mm_unpacklo_epi64(x1[3], x0[3]);
			q2[0] = _mm_unpacklo_epi64(x1[2], x0[2]);
			q3[0] = _mm_unpacklo_epi64(x1[1], x0[1]);
			q4[0] = _mm_unpacklo_epi64(x1[0], x0[0]);

			p1[1] = _mm_unpacklo_epi64(x3[7], x2[7]);
			p2[1] = _mm_unpacklo_epi64(x3[6], x2[6]);
			p3[1] = _mm_unpacklo_epi64(x3[5], x2[5]);
			p4[1] = _mm_unpacklo_epi64(x3[4], x2[4]);
			q1[1] = _mm_unpacklo_epi64(x3[3], x2[3]);
			q2[1] = _mm_unpacklo_epi64(x3[2], x2[2]);
			q3[1] = _mm_unpacklo_epi64(x3[1], x2[1]);
			q4[1] = _mm_unpacklo_epi64(x3[0], x2[0]);

			p1[2] = _mm_unpacklo_epi64(x5[7], x4[7]);
			p2[2] = _mm_unpacklo_epi64(x5[6], x4[6]);
			p3[2] = _mm_unpacklo_epi64(x5[5], x4[5]);
			p4[2] = _mm_unpacklo_epi64(x5[4], x4[4]);
			q1[2] = _mm_unpacklo_epi64(x5[3], x4[3]);
			q2[2] = _mm_unpacklo_epi64(x5[2], x4[2]);
			q3[2] = _mm_unpacklo_epi64(x5[1], x4[1]);
			q4[2] = _mm_unpacklo_epi64(x5[0], x4[0]);

			p1[3] = _mm_unpacklo_epi64(x7[7], x6[7]);
			p2[3] = _mm_unpacklo_epi64(x7[6], x6[6]);
			p3[3] = _mm_unpacklo_epi64(x7[5], x6[5]);
			p4[3] = _mm_unpacklo_epi64(x7[4], x6[4]);
			q1[3] = _mm_unpacklo_epi64(x7[3], x6[3]);
			q2[3] = _mm_unpacklo_epi64(x7[2], x6[2]);
			q3[3] = _mm_unpacklo_epi64(x7[1], x6[1]);
			q4[3] = _mm_unpacklo_epi64(x7[0], x6[0]);

#else

			// clear the accumulators (previously xor-with-self on uninitialized values)
			for(i = 0; i < 8; i++)
			{
				x01[i] = _mm_setzero_si128();
				x23[i] = _mm_setzero_si128();
				x45[i] = _mm_setzero_si128();
				x67[i] = _mm_setzero_si128();
			}

			// row 1
			MUL_BITSLICE256_2(x01, ROW_MOV_EE, 0);
			MUL_BITSLICE256_2(x01, ROW_MOV_OE, 0);
			MUL_BITSLICE256_3(x01, ROW_MOV_EE, 1);
			MUL_BITSLICE256_4(x01, ROW_MOV_OE, 1);
			MUL_BITSLICE256_5(x01, ROW_MOV_EE, 2);
			MUL_BITSLICE256_3(x01, ROW_MOV_OE, 2);
			MUL_BITSLICE256_5(x01, ROW_MOV_EE, 3);
			MUL_BITSLICE256_7(x01, ROW_MOV_OE, 3);

			// row 2
			MUL_BITSLICE256_7(x01, ROW_MOV_EO, 0);
			MUL_BITSLICE256_2(x01, ROW_MOV_OO, 0);
			MUL_BITSLICE256_2(x01, ROW_MOV_EO, 1);
			MUL_BITSLICE256_3(x01, ROW_MOV_OO, 1);
			MUL_BITSLICE256_4(x01, ROW_MOV_EO, 2);
			MUL_BITSLICE256_5(x01, ROW_MOV_OO, 2);
			MUL_BITSLICE256_3(x01, ROW_MOV_EO, 3);
			MUL_BITSLICE256_5(x01, ROW_MOV_OO, 3);

			// row 3
			MUL_BITSLICE256_5(x23, ROW_MOV_EE, 0);
			MUL_BITSLICE256_7(x23, ROW_MOV_OE, 0);
			MUL_BITSLICE256_2(x23, ROW_MOV_EE, 1);
			MUL_BITSLICE256_2(x23, ROW_MOV_OE, 1);
			MUL_BITSLICE256_3(x23, ROW_MOV_EE, 2);
			MUL_BITSLICE256_4(x23, ROW_MOV_OE, 2);
			MUL_BITSLICE256_5(x23, ROW_MOV_EE, 3);
			MUL_BITSLICE256_3(x23, ROW_MOV_OE, 3);

			// row 4
			MUL_BITSLICE256_3(x23, ROW_MOV_EO, 0);
			MUL_BITSLICE256_5(x23, ROW_MOV_OO, 0);
			MUL_BITSLICE256_7(x23, ROW_MOV_EO, 1);
			MUL_BITSLICE256_2(x23, ROW_MOV_OO, 1);
			MUL_BITSLICE256_2(x23, ROW_MOV_EO, 2);
			MUL_BITSLICE256_3(x23, ROW_MOV_OO, 2);
			MUL_BITSLICE256_4(x23, ROW_MOV_EO, 3);
			MUL_BITSLICE256_5(x23, ROW_MOV_OO, 3);

			// row 5
			MUL_BITSLICE256_5(x45, ROW_MOV_EE, 0);
			MUL_BITSLICE256_3(x45, ROW_MOV_OE, 0);
			MUL_BITSLICE256_5(x45, ROW_MOV_EE, 1);
			MUL_BITSLICE256_7(x45, ROW_MOV_OE, 1);
			MUL_BITSLICE256_2(x45, ROW_MOV_EE, 2);
			MUL_BITSLICE256_2(x45, ROW_MOV_OE, 2);
			MUL_BITSLICE256_3(x45, ROW_MOV_EE, 3);
			MUL_BITSLICE256_4(x45, ROW_MOV_OE, 3);

			// row 6
			MUL_BITSLICE256_4(x45, ROW_MOV_EO, 0);
			MUL_BITSLICE256_5(x45, ROW_MOV_OO, 0);
			MUL_BITSLICE256_3(x45, ROW_MOV_EO, 1);
			MUL_BITSLICE256_5(x45, ROW_MOV_OO, 1);
			MUL_BITSLICE256_7(x45, ROW_MOV_EO, 2);
			MUL_BITSLICE256_2(x45, ROW_MOV_OO, 2);
			MUL_BITSLICE256_2(x45, ROW_MOV_EO, 3);
			MUL_BITSLICE256_3(x45, ROW_MOV_OO, 3);

			// row 7
			MUL_BITSLICE256_3(x67, ROW_MOV_EE, 0);
			MUL_BITSLICE256_4(x67, ROW_MOV_OE, 0);
			MUL_BITSLICE256_5(x67, ROW_MOV_EE, 1);
			MUL_BITSLICE256_3(x67, ROW_MOV_OE, 1);
			MUL_BITSLICE256_5(x67, ROW_MOV_EE, 2);
			MUL_BITSLICE256_7(x67, ROW_MOV_OE, 2);
			MUL_BITSLICE256_2(x67, ROW_MOV_EE, 3);
			MUL_BITSLICE256_2(x67, ROW_MOV_OE, 3);

			// row 8
			MUL_BITSLICE256_2(x67, ROW_MOV_EO, 0);
			MUL_BITSLICE256_3(x67, ROW_MOV_OO, 0);
			MUL_BITSLICE256_4(x67, ROW_MOV_EO, 1);
			MUL_BITSLICE256_5(x67, ROW_MOV_OO, 1);
			MUL_BITSLICE256_3(x67, ROW_MOV_EO, 2);
			MUL_BITSLICE256_5(x67, ROW_MOV_OO, 2);
			MUL_BITSLICE256_7(x67, ROW_MOV_EO, 3);
			MUL_BITSLICE256_2(x67, ROW_MOV_OO, 3);

			p1[0] = x01[7];
			p2[0] = x01[6];
			p3[0] = x01[5];
			p4[0] = x01[4];
			q1[0] = x01[3];
			q2[0] = x01[2];
			q3[0] = x01[1];
			q4[0] = x01[0];

			p1[1] = x23[7];
			p2[1] = x23[6];
			p3[1] = x23[5];
			p4[1] = x23[4];
			q1[1] = x23[3];
			q2[1] = x23[2];
			q3[1] = x23[1];
			q4[1] = x23[0];

			p1[2] = x45[7];
			p2[2] = x45[6];
			p3[2] = x45[5];
			p4[2] = x45[4];
			q1[2] = x45[3];
			q2[2] = x45[2];
			q3[2] = x45[1];
			q4[2] = x45[0];

			p1[3] = x67[7];
			p2[3] = x67[6];
			p3[3] = x67[5];
			p4[3] = x67[4];
			q1[3] = x67[3];
			q2[3] = x67[2];
			q3[3] = x67[1];
			q4[3] = x67[0];
#endif
		}

		BITSLICE(p1[0], p2[0], p3[0], p4[0], q1[0], q2[0], q3[0], q4[0], t0);
		BITSLICE(p1[1], p2[1], p3[1], p4[1], q1[1], q2[1], q3[1], q4[1], t0);
		BITSLICE(p1[2], p2[2], p3[2], p4[2], q1[2], q2[2], q3[2], q4[2], t0);
		BITSLICE(p1[3], p2[3], p3[3], p4[3], q1[3], q2[3], q3[3], q4[3], t0);

		// P ^ Q
		for(i = 0; i < 4; i++)
		{
			ctx->state1[i] = _mm_xor_si128(ctx->state1[i], _mm_xor_si128(p1[i], q1[i]));
			ctx->state2[i] = _mm_xor_si128(ctx->state2[i], _mm_xor_si128(p2[i], q2[i]));
			ctx->state3[i] = _mm_xor_si128(ctx->state3[i], _mm_xor_si128(p3[i], q3[i]));
			ctx->state4[i] = _mm_xor_si128(ctx->state4[i], _mm_xor_si128(p4[i], q4[i]));
		}

		pmsg1 += 64;
		pmsg2 += 64;
		pmsg3 += 64;
		pmsg4 += 64;
	}

	// transpose state back
	TRANSPOSE_BACK(ctx->state1, u, u2);
	TRANSPOSE_BACK(ctx->state2, u, u2);
	TRANSPOSE_BACK(ctx->state3, u, u2);
	TRANSPOSE_BACK(ctx->state4, u, u2);
}

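/*
 * TRANSPOSE512 (added note): 8-register variant of TRANSPOSE for the
 * 1024-bit Grostl-512 state; TRANSPOSE512_BACK restores the original byte
 * order through _invmask.
 */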
#define TRANSPOSE512(m, u, v)\
	u[0] = _mm_shuffle_epi8(m[0], M128(_transpose1));\
	u[1] = _mm_shuffle_epi8(m[1], M128(_transpose1));\
	u[2] = _mm_shuffle_epi8(m[2], M128(_transpose1));\
	u[3] = _mm_shuffle_epi8(m[3], M128(_transpose1));\
	u[4] = _mm_shuffle_epi8(m[4], M128(_transpose1));\
	u[5] = _mm_shuffle_epi8(m[5], M128(_transpose1));\
	u[6] = _mm_shuffle_epi8(m[6], M128(_transpose1));\
	u[7] = _mm_shuffle_epi8(m[7], M128(_transpose1));\
	v[0] = _mm_unpacklo_epi16(u[7], u[6]);\
	v[1] = _mm_unpacklo_epi16(u[5], u[4]);\
	v[2] = _mm_unpacklo_epi16(u[3], u[2]);\
	v[3] = _mm_unpacklo_epi16(u[1], u[0]);\
	v[4] = _mm_unpackhi_epi16(u[7], u[6]);\
	v[5] = _mm_unpackhi_epi16(u[5], u[4]);\
	v[6] = _mm_unpackhi_epi16(u[3], u[2]);\
	v[7] = _mm_unpackhi_epi16(u[1], u[0]);\
	u[0] = _mm_unpackhi_epi32(v[6], v[7]);\
	u[1] = _mm_unpacklo_epi32(v[6], v[7]);\
	u[2] = _mm_unpackhi_epi32(v[4], v[5]);\
	u[3] = _mm_unpacklo_epi32(v[4], v[5]);\
	u[4] = _mm_unpackhi_epi32(v[2], v[3]);\
	u[5] = _mm_unpacklo_epi32(v[2], v[3]);\
	u[6] = _mm_unpackhi_epi32(v[0], v[1]);\
	u[7] = _mm_unpacklo_epi32(v[0], v[1]);\
	m[0] = _mm_unpackhi_epi64(u[2], u[0]);\
	m[1] = _mm_unpacklo_epi64(u[2], u[0]);\
	m[2] = _mm_unpackhi_epi64(u[3], u[1]);\
	m[3] = _mm_unpacklo_epi64(u[3], u[1]);\
	m[4] = _mm_unpackhi_epi64(u[6], u[4]);\
	m[5] = _mm_unpacklo_epi64(u[6], u[4]);\
	m[6] = _mm_unpackhi_epi64(u[7], u[5]);\
	m[7] = _mm_unpacklo_epi64(u[7], u[5])

#define TRANSPOSE512_BACK(m, u, v)\
	u[0] = _mm_shuffle_epi8(m[0], M128(_invmask));\
	u[1] = _mm_shuffle_epi8(m[1], M128(_invmask));\
	u[2] = _mm_shuffle_epi8(m[2], M128(_invmask));\
	u[3] = _mm_shuffle_epi8(m[3], M128(_invmask));\
	u[4] = _mm_shuffle_epi8(m[4], M128(_invmask));\
	u[5] = _mm_shuffle_epi8(m[5], M128(_invmask));\
	u[6] = _mm_shuffle_epi8(m[6], M128(_invmask));\
	u[7] = _mm_shuffle_epi8(m[7], M128(_invmask));\
	v[0] = _mm_unpacklo_epi8(u[0], u[1]);\
	v[1] = _mm_unpacklo_epi8(u[2], u[3]);\
	v[2] = _mm_unpacklo_epi8(u[4], u[5]);\
	v[3] = _mm_unpacklo_epi8(u[6], u[7]);\
	v[4] = _mm_unpackhi_epi8(u[0], u[1]);\
	v[5] = _mm_unpackhi_epi8(u[2], u[3]);\
	v[6] = _mm_unpackhi_epi8(u[4], u[5]);\
	v[7] = _mm_unpackhi_epi8(u[6], u[7]);\
	u[0] = _mm_unpacklo_epi16(v[0], v[1]);\
	u[1] = _mm_unpacklo_epi16(v[2], v[3]);\
	u[2] = _mm_unpacklo_epi16(v[4], v[5]);\
	u[3] = _mm_unpacklo_epi16(v[6], v[7]);\
	u[4] = _mm_unpackhi_epi16(v[0], v[1]);\
	u[5] = _mm_unpackhi_epi16(v[2], v[3]);\
	u[6] = _mm_unpackhi_epi16(v[4], v[5]);\
	u[7] = _mm_unpackhi_epi16(v[6], v[7]);\
	m[0] = _mm_unpacklo_epi32(u[0], u[1]);\
	m[1] = _mm_unpackhi_epi32(u[0], u[1]);\
	m[2] = _mm_unpacklo_epi32(u[4], u[5]);\
	m[3] = _mm_unpackhi_epi32(u[4], u[5]);\
	m[4] = _mm_unpacklo_epi32(u[2], u[3]);\
	m[5] = _mm_unpackhi_epi32(u[2], u[3]);\
	m[6] = _mm_unpacklo_epi32(u[6], u[7]);\
	m[7] = _mm_unpackhi_epi32(u[6], u[7])

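/*
 * Compress512 (added summary): same round structure as Compress256 --
 * AddRoundConstant, bitsliced SubBytes, ShiftBytes, MixBytes -- but on the
 * 8 x 16 byte state: 128-byte blocks, 14 rounds, eight registers per
 * stream.  Note that the message pointers are not advanced inside the
 * block loop, which is why grssUpdate calls it with uBlockCount == 1.
 */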
void Compress512(grssState *ctx,
	const unsigned char *pmsg1, const unsigned char *pmsg2, const unsigned char *pmsg3, const unsigned char *pmsg4,
	DataLength uBlockCount)
{
	__m128i u[8], v[8], p1[8], p2[8], p3[8], p4[8], q1[8], q2[8], q3[8], q4[8], t;
	__m128i t0, t1, t2, t3, s0, s1, s2, s3;
	__m128i x0[8], x1[8], x2[8], x3[8], x4[8], x5[8], x6[8], x7[8];
	DataLength b;
	unsigned int i, r;

	// transpose cv
	TRANSPOSE512(ctx->state1, u, v);
	TRANSPOSE512(ctx->state2, u, v);
	TRANSPOSE512(ctx->state3, u, v);
	TRANSPOSE512(ctx->state4, u, v);

	for(b = 0; b < uBlockCount; b++)
	{
		// load message
		for(i = 0; i < 8; i++)
		{
			q1[i] = _mm_loadu_si128((__m128i*)pmsg1 + i);
			q2[i] = _mm_loadu_si128((__m128i*)pmsg2 + i);
			q3[i] = _mm_loadu_si128((__m128i*)pmsg3 + i);
			q4[i] = _mm_loadu_si128((__m128i*)pmsg4 + i);
		}

		// transpose message
		TRANSPOSE512(q1, u, v);
		TRANSPOSE512(q2, u, v);
		TRANSPOSE512(q3, u, v);
		TRANSPOSE512(q4, u, v);

		// xor cv and message
		for(i = 0; i < 8; i++)
		{
			p1[i] = _mm_xor_si128(ctx->state1[i], q1[i]);
			p2[i] = _mm_xor_si128(ctx->state2[i], q2[i]);
			p3[i] = _mm_xor_si128(ctx->state3[i], q3[i]);
			p4[i] = _mm_xor_si128(ctx->state4[i], q4[i]);
		}

		for(i = 0; i < 8; i++)
		{
			BITSLICE(p1[i], p2[i], p3[i], p4[i], q1[i], q2[i], q3[i], q4[i], t);
		}

		for(r = 0; r < 14; r++)
		{
			// add constant
			p1[0] = _mm_xor_si128(p1[0], ctx->_Pconst[r][0]);
			p2[0] = _mm_xor_si128(p2[0], ctx->_Pconst[r][1]);
			p3[0] = _mm_xor_si128(p3[0], ctx->_Pconst[r][2]);
			p4[0] = _mm_xor_si128(p4[0], ctx->_Pconst[r][3]);
			q1[0] = _mm_xor_si128(q1[0], ctx->_Pconst[r][4]);
			q2[0] = _mm_xor_si128(q2[0], ctx->_Pconst[r][5]);
			q3[0] = _mm_xor_si128(q3[0], ctx->_Pconst[r][6]);
			q4[0] = _mm_xor_si128(q4[0], ctx->_Pconst[r][7]);

			p1[7] = _mm_xor_si128(p1[7], ctx->_Qconst[r][0]);
			p2[7] = _mm_xor_si128(p2[7], ctx->_Qconst[r][1]);
			p3[7] = _mm_xor_si128(p3[7], ctx->_Qconst[r][2]);
			p4[7] = _mm_xor_si128(p4[7], ctx->_Qconst[r][3]);
			q1[7] = _mm_xor_si128(q1[7], ctx->_Qconst[r][4]);
			q2[7] = _mm_xor_si128(q2[7], ctx->_Qconst[r][5]);
			q3[7] = _mm_xor_si128(q3[7], ctx->_Qconst[r][6]);
			q4[7] = _mm_xor_si128(q4[7], ctx->_Qconst[r][7]);

			// sub bytes
			for(i = 0; i < 8; i++)
			{
				SUBSTITUTE_BITSLICE(q4[i], q3[i], q2[i], q1[i], p4[i], p3[i], p2[i], p1[i], t0, t1, t2, t3, s0, s1, s2, s3);
			}

			// shift bytes
			for(i = 1; i < 8; i++)
			{
				p1[i] = _mm_shuffle_epi8(p1[i], ctx->_shiftconst[i]);
				p2[i] = _mm_shuffle_epi8(p2[i], ctx->_shiftconst[i]);
				p3[i] = _mm_shuffle_epi8(p3[i], ctx->_shiftconst[i]);
				p4[i] = _mm_shuffle_epi8(p4[i], ctx->_shiftconst[i]);

				q1[i] = _mm_shuffle_epi8(q1[i], ctx->_shiftconst[i]);
				q2[i] = _mm_shuffle_epi8(q2[i], ctx->_shiftconst[i]);
				q3[i] = _mm_shuffle_epi8(q3[i], ctx->_shiftconst[i]);
				q4[i] = _mm_shuffle_epi8(q4[i], ctx->_shiftconst[i]);
			}

			// mix bytes
			// clear the accumulators (previously xor-with-self on uninitialized values)
			for(i = 0; i < 8; i++)
			{
				x0[i] = _mm_setzero_si128();
				x1[i] = _mm_setzero_si128();
				x2[i] = _mm_setzero_si128();
				x3[i] = _mm_setzero_si128();
				x4[i] = _mm_setzero_si128();
				x5[i] = _mm_setzero_si128();
				x6[i] = _mm_setzero_si128();
				x7[i] = _mm_setzero_si128();
			}

			MUL_BITSLICE_2(x0, 0, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_2(x0, 1, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x0, 2, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_4(x0, 3, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x0, 4, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x0, 5, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x0, 6, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_7(x0, 7, p1, p2, p3, p4, q1, q2, q3, q4);

			MUL_BITSLICE_2(x1, 1, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_2(x1, 2, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x1, 3, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_4(x1, 4, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x1, 5, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x1, 6, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x1, 7, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_7(x1, 0, p1, p2, p3, p4, q1, q2, q3, q4);

			MUL_BITSLICE_2(x2, 2, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_2(x2, 3, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x2, 4, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_4(x2, 5, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x2, 6, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x2, 7, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x2, 0, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_7(x2, 1, p1, p2, p3, p4, q1, q2, q3, q4);

			MUL_BITSLICE_2(x3, 3, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_2(x3, 4, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x3, 5, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_4(x3, 6, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x3, 7, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x3, 0, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x3, 1, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_7(x3, 2, p1, p2, p3, p4, q1, q2, q3, q4);

			MUL_BITSLICE_2(x4, 4, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_2(x4, 5, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x4, 6, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_4(x4, 7, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x4, 0, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x4, 1, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x4, 2, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_7(x4, 3, p1, p2, p3, p4, q1, q2, q3, q4);

			MUL_BITSLICE_2(x5, 5, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_2(x5, 6, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x5, 7, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_4(x5, 0, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x5, 1, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x5, 2, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x5, 3, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_7(x5, 4, p1, p2, p3, p4, q1, q2, q3, q4);

			MUL_BITSLICE_2(x6, 6, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_2(x6, 7, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x6, 0, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_4(x6, 1, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x6, 2, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x6, 3, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x6, 4, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_7(x6, 5, p1, p2, p3, p4, q1, q2, q3, q4);

			MUL_BITSLICE_2(x7, 7, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_2(x7, 0, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x7, 1, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_4(x7, 2, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x7, 3, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_3(x7, 4, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_5(x7, 5, p1, p2, p3, p4, q1, q2, q3, q4);
			MUL_BITSLICE_7(x7, 6, p1, p2, p3, p4, q1, q2, q3, q4);

			p1[0] = x0[7];
			p2[0] = x0[6];
			p3[0] = x0[5];
			p4[0] = x0[4];
			q1[0] = x0[3];
			q2[0] = x0[2];
			q3[0] = x0[1];
			q4[0] = x0[0];

			p1[1] = x1[7];
			p2[1] = x1[6];
			p3[1] = x1[5];
			p4[1] = x1[4];
			q1[1] = x1[3];
			q2[1] = x1[2];
			q3[1] = x1[1];
			q4[1] = x1[0];

			p1[2] = x2[7];
			p2[2] = x2[6];
			p3[2] = x2[5];
			p4[2] = x2[4];
			q1[2] = x2[3];
			q2[2] = x2[2];
			q3[2] = x2[1];
			q4[2] = x2[0];

			p1[3] = x3[7];
			p2[3] = x3[6];
			p3[3] = x3[5];
			p4[3] = x3[4];
			q1[3] = x3[3];
			q2[3] = x3[2];
			q3[3] = x3[1];
			q4[3] = x3[0];

			p1[4] = x4[7];
			p2[4] = x4[6];
			p3[4] = x4[5];
			p4[4] = x4[4];
			q1[4] = x4[3];
			q2[4] = x4[2];
			q3[4] = x4[1];
			q4[4] = x4[0];

			p1[5] = x5[7];
			p2[5] = x5[6];
			p3[5] = x5[5];
			p4[5] = x5[4];
			q1[5] = x5[3];
			q2[5] = x5[2];
			q3[5] = x5[1];
			q4[5] = x5[0];

			p1[6] = x6[7];
			p2[6] = x6[6];
			p3[6] = x6[5];
			p4[6] = x6[4];
			q1[6] = x6[3];
			q2[6] = x6[2];
			q3[6] = x6[1];
			q4[6] = x6[0];

			p1[7] = x7[7];
			p2[7] = x7[6];
			p3[7] = x7[5];
			p4[7] = x7[4];
			q1[7] = x7[3];
			q2[7] = x7[2];
			q3[7] = x7[1];
			q4[7] = x7[0];
		}

		for(i = 0; i < 8; i++)
		{
			BITSLICE(p1[i], p2[i], p3[i], p4[i], q1[i], q2[i], q3[i], q4[i], t);
		}

		// P ^ Q
		for(i = 0; i < 8; i++)
		{
			ctx->state1[i] = _mm_xor_si128(ctx->state1[i], _mm_xor_si128(p1[i], q1[i]));
			ctx->state2[i] = _mm_xor_si128(ctx->state2[i], _mm_xor_si128(p2[i], q2[i]));
			ctx->state3[i] = _mm_xor_si128(ctx->state3[i], _mm_xor_si128(p3[i], q3[i]));
			ctx->state4[i] = _mm_xor_si128(ctx->state4[i], _mm_xor_si128(p4[i], q4[i]));
		}
	}

	// transpose state back
	TRANSPOSE512_BACK(ctx->state1, u, v);
	TRANSPOSE512_BACK(ctx->state2, u, v);
	TRANSPOSE512_BACK(ctx->state3, u, v);
	TRANSPOSE512_BACK(ctx->state4, u, v);
}

void grssInit(grssState *pctx, int grssbitlen)
{
	pctx->uHashLength = grssbitlen;

	switch(grssbitlen)
	{
		case 256:
			pctx->uBlockLength = 64;
			Init256(pctx);
			break;

		case 512:
			pctx->uBlockLength = 128;
			Init512(pctx);
			break;
	}
}

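/*
 * grssUpdate (added note): the input buffer is treated as four independent,
 * equal-length messages laid out back to back; each quarter of the
 * whole-block count feeds one of the four parallel streams.
 */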
void grssUpdate(grssState *state, const BitSequence *data, DataLength databitlen)
{
	DataLength uByteLength, uBlockCount;

	uByteLength = databitlen / 8;
	uBlockCount = uByteLength / state->uBlockLength;

	if(state->uHashLength == 256)
	{
		Compress256(state,
			data + 0 * (uBlockCount / 4) * state->uBlockLength,
			data + 1 * (uBlockCount / 4) * state->uBlockLength,
			data + 2 * (uBlockCount / 4) * state->uBlockLength,
			data + 3 * (uBlockCount / 4) * state->uBlockLength,
			uBlockCount / 4);
	}
	else
	{
		// the 512-bit path compresses exactly one block per stream per call
		Compress512(state,
			data + 0 * (uBlockCount / 4) * state->uBlockLength,
			data + 1 * (uBlockCount / 4) * state->uBlockLength,
			data + 2 * (uBlockCount / 4) * state->uBlockLength,
			data + 3 * (uBlockCount / 4) * state->uBlockLength,
			1);
		/*uBlockCount / 4); */
	}
}

void grssFinal(grssState *state, BitSequence *grssval)
{
	if(state->uHashLength == 256)
	{
		_mm_storeu_si128((__m128i*)grssval + 0, state->state1[0]);
		_mm_storeu_si128((__m128i*)grssval + 1, state->state1[1]);
	}
	else
	{
		_mm_storeu_si128((__m128i*)grssval + 0, state->state1[0]);
		_mm_storeu_si128((__m128i*)grssval + 1, state->state1[1]);
		_mm_storeu_si128((__m128i*)grssval + 2, state->state1[2]);
		_mm_storeu_si128((__m128i*)grssval + 3, state->state1[3]);
	}
}

void Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
	grssState hs;
	grssInit(&hs, hashbitlen);
	grssUpdate(&hs, data, databitlen);
	grssFinal(&hs, hashval);
}
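
/*
 * Usage sketch (added; assumes the usual NIST typedefs, i.e. BitSequence =
 * unsigned char and a 64-bit DataLength).  Padding is the caller's job;
 * the buffer must hold four equal, pre-padded streams back to back.
 */
#if 0
#include <string.h>

static void example(void)
{
	unsigned char msgs[4 * 64];	/* four 64-byte blocks, one per stream */
	unsigned char digest[32];	/* 256-bit digest of the first stream */

	memset(msgs, 0, sizeof(msgs));	/* fill with real padded messages */
	Hash(256, msgs, sizeof(msgs) * 8, digest);
}
#endif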