// cpuminer-opt-gpu/algo/groestl/sse2/grss.c
/*
 * file : grostl_bitsliced_mm.c
 * version : 1.0.208
 * date : 14.12.2010
 *
 * - multi-stream bitsliced implementation of the Grøstl hash function
 * - implements the NIST hash API
 * - assumes that the message length is a multiple of 8 bits
 * - _GROSTL_BITSLICED_MM_ must be defined if compiling with ../main.c
 *
 * Cagdas Calik
 * ccalik@metu.edu.tr
 * Institute of Applied Mathematics, Middle East Technical University, Turkey.
 *
 */
#include "grss_api.h"
#include "bitsliceaes.h"
MYALIGN const unsigned int _transpose1[] = {0x060e070f, 0x040c050d, 0x020a030b, 0x00080109};
MYALIGN const unsigned int _hiqmask[] = {0x00000000, 0x00000000, 0xffffffff, 0xffffffff};
MYALIGN const unsigned int _loqmask[] = {0xffffffff, 0xffffffff, 0x00000000, 0x00000000};
MYALIGN const unsigned int _invmask[] = {0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203};
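// Reorder the 64-byte state held in m[0..3] between message byte order and the
// row layout used by the round code; u[] and v[] are caller-supplied scratch.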
#define TRANSPOSE(m, u, v)\
u[0] = _mm_shuffle_epi8(m[0], M128(_transpose1));\
u[1] = _mm_shuffle_epi8(m[1], M128(_transpose1));\
u[2] = _mm_shuffle_epi8(m[2], M128(_transpose1));\
u[3] = _mm_shuffle_epi8(m[3], M128(_transpose1));\
v[0] = _mm_unpacklo_epi16(u[3], u[2]);\
v[1] = _mm_unpacklo_epi16(u[1], u[0]);\
v[2] = _mm_unpackhi_epi16(u[3], u[2]);\
v[3] = _mm_unpackhi_epi16(u[1], u[0]);\
m[0] = _mm_unpackhi_epi32(v[2], v[3]);\
m[1] = _mm_unpacklo_epi32(v[2], v[3]);\
m[2] = _mm_unpackhi_epi32(v[0], v[1]);\
m[3] = _mm_unpacklo_epi32(v[0], v[1])
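// Inverse of TRANSPOSE: restore the original byte order of the state.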
#define TRANSPOSE_BACK(m, u, v)\
u[0] = _mm_shuffle_epi8(m[0], M128(_transpose1));\
u[1] = _mm_shuffle_epi8(m[1], M128(_transpose1));\
u[2] = _mm_shuffle_epi8(m[2], M128(_transpose1));\
u[3] = _mm_shuffle_epi8(m[3], M128(_transpose1));\
v[0] = _mm_unpacklo_epi16(u[0], u[1]);\
v[1] = _mm_unpacklo_epi16(u[2], u[3]);\
v[2] = _mm_unpackhi_epi16(u[0], u[1]);\
v[3] = _mm_unpackhi_epi16(u[2], u[3]);\
m[0] = _mm_unpacklo_epi32(v[0], v[1]);\
m[1] = _mm_unpackhi_epi32(v[0], v[1]);\
m[2] = _mm_unpacklo_epi32(v[2], v[3]);\
m[3] = _mm_unpackhi_epi32(v[2], v[3])
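// Set up one context for four independent Grøstl-256 streams: load the IV
// (the output length, 256 = 0x0100, encoded in the last state bytes),
// precompute the bitsliced round constants for the P and Q permutations, and
// build the ShiftBytes pshufb masks (each mask rotates the two 8-byte rows
// packed into one register by their respective offsets).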
void Init256(grssState *pctx)
{
unsigned int i;
__m128i t;
pctx->state1[0] = _mm_set_epi32(0, 0, 0, 0);
pctx->state1[1] = _mm_set_epi32(0, 0, 0, 0);
pctx->state1[2] = _mm_set_epi32(0, 0, 0, 0);
pctx->state1[3] = _mm_set_epi32(0x00010000, 0, 0, 0);
pctx->state2[0] = _mm_set_epi32(0, 0, 0, 0);
pctx->state2[1] = _mm_set_epi32(0, 0, 0, 0);
pctx->state2[2] = _mm_set_epi32(0, 0, 0, 0);
pctx->state2[3] = _mm_set_epi32(0x00010000, 0, 0, 0);
pctx->state3[0] = _mm_set_epi32(0, 0, 0, 0);
pctx->state3[1] = _mm_set_epi32(0, 0, 0, 0);
pctx->state3[2] = _mm_set_epi32(0, 0, 0, 0);
pctx->state3[3] = _mm_set_epi32(0x00010000, 0, 0, 0);
pctx->state4[0] = _mm_set_epi32(0, 0, 0, 0);
pctx->state4[1] = _mm_set_epi32(0, 0, 0, 0);
pctx->state4[2] = _mm_set_epi32(0, 0, 0, 0);
pctx->state4[3] = _mm_set_epi32(0x00010000, 0, 0, 0);
for(i = 0; i < 10; i++)
{
pctx->_Pconst[i][0] = _mm_set_epi32(i << 24, 0, 0, 0);
pctx->_Pconst[i][1] = _mm_set_epi32(i << 24, 0, 0, 0);
pctx->_Pconst[i][2] = _mm_set_epi32(i << 24, 0, 0, 0);
pctx->_Pconst[i][3] = _mm_set_epi32(i << 24, 0, 0, 0);
pctx->_Pconst[i][4] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Pconst[i][5] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Pconst[i][6] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Pconst[i][7] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Qconst[i][0] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Qconst[i][1] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Qconst[i][2] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Qconst[i][3] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Qconst[i][4] = _mm_set_epi32(0, 0, (~i) << 24, 0);
pctx->_Qconst[i][5] = _mm_set_epi32(0, 0, (~i) << 24, 0);
pctx->_Qconst[i][6] = _mm_set_epi32(0, 0, (~i) << 24, 0);
pctx->_Qconst[i][7] = _mm_set_epi32(0, 0, (~i) << 24, 0);
BITSLICE(pctx->_Pconst[i][0], pctx->_Pconst[i][1], pctx->_Pconst[i][2], pctx->_Pconst[i][3], pctx->_Pconst[i][4], pctx->_Pconst[i][5], pctx->_Pconst[i][6], pctx->_Pconst[i][7], t);
BITSLICE(pctx->_Qconst[i][0], pctx->_Qconst[i][1], pctx->_Qconst[i][2], pctx->_Qconst[i][3], pctx->_Qconst[i][4], pctx->_Qconst[i][5], pctx->_Qconst[i][6], pctx->_Qconst[i][7], t);
}
pctx->_shiftconst[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x06050403, 0x02010007);
pctx->_shiftconst[1] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x04030201, 0x00070605);
pctx->_shiftconst[2] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x02010007, 0x06050403);
pctx->_shiftconst[3] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x00070605, 0x04030201);
}
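// Grøstl-512 variant: eight registers per stream, IV encodes output length 512
// (0x0200), 14 round constants. Row 0 needs no ShiftBytes rotation, so
// _shiftconst[0] is left unset and the shift loop in Compress512 starts at 1;
// _shiftconst[7] implements the 11-byte shift of the last row.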
void Init512(grssState *pctx)
{
unsigned int i;
__m128i t;
pctx->state1[0] = _mm_set_epi32(0, 0, 0, 0);
pctx->state1[1] = _mm_set_epi32(0, 0, 0, 0);
pctx->state1[2] = _mm_set_epi32(0, 0, 0, 0);
pctx->state1[3] = _mm_set_epi32(0, 0, 0, 0);
pctx->state1[4] = _mm_set_epi32(0, 0, 0, 0);
pctx->state1[5] = _mm_set_epi32(0, 0, 0, 0);
pctx->state1[6] = _mm_set_epi32(0, 0, 0, 0);
pctx->state1[7] = _mm_set_epi32(0x00020000, 0, 0, 0);
pctx->state2[0] = _mm_set_epi32(0, 0, 0, 0);
pctx->state2[1] = _mm_set_epi32(0, 0, 0, 0);
pctx->state2[2] = _mm_set_epi32(0, 0, 0, 0);
pctx->state2[3] = _mm_set_epi32(0, 0, 0, 0);
pctx->state2[4] = _mm_set_epi32(0, 0, 0, 0);
pctx->state2[5] = _mm_set_epi32(0, 0, 0, 0);
pctx->state2[6] = _mm_set_epi32(0, 0, 0, 0);
pctx->state2[7] = _mm_set_epi32(0x00020000, 0, 0, 0);
pctx->state3[0] = _mm_set_epi32(0, 0, 0, 0);
pctx->state3[1] = _mm_set_epi32(0, 0, 0, 0);
pctx->state3[2] = _mm_set_epi32(0, 0, 0, 0);
pctx->state3[3] = _mm_set_epi32(0, 0, 0, 0);
pctx->state3[4] = _mm_set_epi32(0, 0, 0, 0);
pctx->state3[5] = _mm_set_epi32(0, 0, 0, 0);
pctx->state3[6] = _mm_set_epi32(0, 0, 0, 0);
pctx->state3[7] = _mm_set_epi32(0x00020000, 0, 0, 0);
pctx->state4[0] = _mm_set_epi32(0, 0, 0, 0);
pctx->state4[1] = _mm_set_epi32(0, 0, 0, 0);
pctx->state4[2] = _mm_set_epi32(0, 0, 0, 0);
pctx->state4[3] = _mm_set_epi32(0, 0, 0, 0);
pctx->state4[4] = _mm_set_epi32(0, 0, 0, 0);
pctx->state4[5] = _mm_set_epi32(0, 0, 0, 0);
pctx->state4[6] = _mm_set_epi32(0, 0, 0, 0);
pctx->state4[7] = _mm_set_epi32(0x00020000, 0, 0, 0);
for(i = 0; i < 14; i++)
{
pctx->_Pconst[i][0] = _mm_set_epi32(i << 24, 0, 0, 0);
pctx->_Pconst[i][1] = _mm_set_epi32(i << 24, 0, 0, 0);
pctx->_Pconst[i][2] = _mm_set_epi32(i << 24, 0, 0, 0);
pctx->_Pconst[i][3] = _mm_set_epi32(i << 24, 0, 0, 0);
pctx->_Pconst[i][4] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Pconst[i][5] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Pconst[i][6] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Pconst[i][7] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Qconst[i][4] = _mm_set_epi32((~i) << 24, 0, 0, 0);
pctx->_Qconst[i][5] = _mm_set_epi32((~i) << 24, 0, 0, 0);
pctx->_Qconst[i][6] = _mm_set_epi32((~i) << 24, 0, 0, 0);
pctx->_Qconst[i][7] = _mm_set_epi32((~i) << 24, 0, 0, 0);
pctx->_Qconst[i][0] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Qconst[i][1] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Qconst[i][2] = _mm_set_epi32(0, 0, 0, 0);
pctx->_Qconst[i][3] = _mm_set_epi32(0, 0, 0, 0);
BITSLICE(pctx->_Pconst[i][0], pctx->_Pconst[i][1], pctx->_Pconst[i][2], pctx->_Pconst[i][3], pctx->_Pconst[i][4], pctx->_Pconst[i][5], pctx->_Pconst[i][6], pctx->_Pconst[i][7], t);
BITSLICE(pctx->_Qconst[i][0], pctx->_Qconst[i][1], pctx->_Qconst[i][2], pctx->_Qconst[i][3], pctx->_Qconst[i][4], pctx->_Qconst[i][5], pctx->_Qconst[i][6], pctx->_Qconst[i][7], t);
}
pctx->_shiftconst[1] = _mm_set_epi32(0x0e0d0c0b, 0x0a090807, 0x06050403, 0x0201000f);
pctx->_shiftconst[2] = _mm_set_epi32(0x0d0c0b0a, 0x09080706, 0x05040302, 0x01000f0e);
pctx->_shiftconst[3] = _mm_set_epi32(0x0c0b0a09, 0x08070605, 0x04030201, 0x000f0e0d);
pctx->_shiftconst[4] = _mm_set_epi32(0x0b0a0908, 0x07060504, 0x03020100, 0x0f0e0d0c);
pctx->_shiftconst[5] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b);
pctx->_shiftconst[6] = _mm_set_epi32(0x09080706, 0x05040302, 0x01000f0e, 0x0d0c0b0a);
pctx->_shiftconst[7] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605);
}
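// Bitsliced GF(2^8) multiplications (AES polynomial 0x11B) for MixBytes.
// b7..b0 are the bit planes of the source bytes; results are XOR-accumulated
// into the bit planes x[7..0]. The coefficients 2, 3, 4, 5 and 7 are the
// entries of Grøstl's circulant MixBytes matrix.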
#define MUL_BITSLICE_2(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\
x[7] = _mm_xor_si128(x[7], b6[i]);\
x[6] = _mm_xor_si128(x[6], b5[i]);\
x[5] = _mm_xor_si128(x[5], b4[i]);\
x[4] = _mm_xor_si128(x[4], b3[i]);\
x[4] = _mm_xor_si128(x[4], b7[i]);\
x[3] = _mm_xor_si128(x[3], b2[i]);\
x[3] = _mm_xor_si128(x[3], b7[i]);\
x[2] = _mm_xor_si128(x[2], b1[i]);\
x[1] = _mm_xor_si128(x[1], b0[i]);\
x[1] = _mm_xor_si128(x[1], b7[i]);\
x[0] = _mm_xor_si128(x[0], b7[i])
#define MUL_BITSLICE_3(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\
x[7] = _mm_xor_si128(x[7], b6[i]);\
x[7] = _mm_xor_si128(x[7], b7[i]);\
x[6] = _mm_xor_si128(x[6], b5[i]);\
x[6] = _mm_xor_si128(x[6], b6[i]);\
x[5] = _mm_xor_si128(x[5], b4[i]);\
x[5] = _mm_xor_si128(x[5], b5[i]);\
x[4] = _mm_xor_si128(x[4], b3[i]);\
x[4] = _mm_xor_si128(x[4], b4[i]);\
x[4] = _mm_xor_si128(x[4], b7[i]);\
x[3] = _mm_xor_si128(x[3], b2[i]);\
x[3] = _mm_xor_si128(x[3], b3[i]);\
x[3] = _mm_xor_si128(x[3], b7[i]);\
x[2] = _mm_xor_si128(x[2], b1[i]);\
x[2] = _mm_xor_si128(x[2], b2[i]);\
x[1] = _mm_xor_si128(x[1], b0[i]);\
x[1] = _mm_xor_si128(x[1], b1[i]);\
x[1] = _mm_xor_si128(x[1], b7[i]);\
x[0] = _mm_xor_si128(x[0], b0[i]);\
x[0] = _mm_xor_si128(x[0], b7[i])
#define MUL_BITSLICE_4(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\
x[7] = _mm_xor_si128(x[7], b5[i]);\
x[6] = _mm_xor_si128(x[6], b4[i]);\
x[5] = _mm_xor_si128(x[5], b3[i]);\
x[5] = _mm_xor_si128(x[5], b7[i]);\
x[4] = _mm_xor_si128(x[4], b2[i]);\
x[4] = _mm_xor_si128(x[4], b6[i]);\
x[4] = _mm_xor_si128(x[4], b7[i]);\
x[3] = _mm_xor_si128(x[3], b1[i]);\
x[3] = _mm_xor_si128(x[3], b6[i]);\
x[2] = _mm_xor_si128(x[2], b0[i]);\
x[2] = _mm_xor_si128(x[2], b7[i]);\
x[1] = _mm_xor_si128(x[1], b6[i]);\
x[1] = _mm_xor_si128(x[1], b7[i]);\
x[0] = _mm_xor_si128(x[0], b6[i])
#define MUL_BITSLICE_5(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\
x[7] = _mm_xor_si128(x[7], b5[i]);\
x[7] = _mm_xor_si128(x[7], b7[i]);\
x[6] = _mm_xor_si128(x[6], b4[i]);\
x[6] = _mm_xor_si128(x[6], b6[i]);\
x[5] = _mm_xor_si128(x[5], b3[i]);\
x[5] = _mm_xor_si128(x[5], b5[i]);\
x[5] = _mm_xor_si128(x[5], b7[i]);\
x[4] = _mm_xor_si128(x[4], b2[i]);\
x[4] = _mm_xor_si128(x[4], b4[i]);\
x[4] = _mm_xor_si128(x[4], b6[i]);\
x[4] = _mm_xor_si128(x[4], b7[i]);\
x[3] = _mm_xor_si128(x[3], b1[i]);\
x[3] = _mm_xor_si128(x[3], b3[i]);\
x[3] = _mm_xor_si128(x[3], b6[i]);\
x[2] = _mm_xor_si128(x[2], b0[i]);\
x[2] = _mm_xor_si128(x[2], b2[i]);\
x[2] = _mm_xor_si128(x[2], b7[i]);\
x[1] = _mm_xor_si128(x[1], b1[i]);\
x[1] = _mm_xor_si128(x[1], b6[i]);\
x[1] = _mm_xor_si128(x[1], b7[i]);\
x[0] = _mm_xor_si128(x[0], b0[i]);\
x[0] = _mm_xor_si128(x[0], b6[i])
#define MUL_BITSLICE_7(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\
x[7] = _mm_xor_si128(x[7], b5[i]);\
x[7] = _mm_xor_si128(x[7], b6[i]);\
x[7] = _mm_xor_si128(x[7], b7[i]);\
x[6] = _mm_xor_si128(x[6], b4[i]);\
x[6] = _mm_xor_si128(x[6], b5[i]);\
x[6] = _mm_xor_si128(x[6], b6[i]);\
x[5] = _mm_xor_si128(x[5], b3[i]);\
x[5] = _mm_xor_si128(x[5], b4[i]);\
x[5] = _mm_xor_si128(x[5], b5[i]);\
x[5] = _mm_xor_si128(x[5], b7[i]);\
x[4] = _mm_xor_si128(x[4], b2[i]);\
x[4] = _mm_xor_si128(x[4], b3[i]);\
x[4] = _mm_xor_si128(x[4], b4[i]);\
x[4] = _mm_xor_si128(x[4], b6[i]);\
x[3] = _mm_xor_si128(x[3], b1[i]);\
x[3] = _mm_xor_si128(x[3], b2[i]);\
x[3] = _mm_xor_si128(x[3], b3[i]);\
x[3] = _mm_xor_si128(x[3], b6[i]);\
x[3] = _mm_xor_si128(x[3], b7[i]);\
x[2] = _mm_xor_si128(x[2], b0[i]);\
x[2] = _mm_xor_si128(x[2], b1[i]);\
x[2] = _mm_xor_si128(x[2], b2[i]);\
x[2] = _mm_xor_si128(x[2], b7[i]);\
x[1] = _mm_xor_si128(x[1], b0[i]);\
x[1] = _mm_xor_si128(x[1], b1[i]);\
x[1] = _mm_xor_si128(x[1], b6[i]);\
x[0] = _mm_xor_si128(x[0], b0[i]);\
x[0] = _mm_xor_si128(x[0], b6[i]);\
x[0] = _mm_xor_si128(x[0], b7[i])
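// Move or keep one 64-bit row half within a register: L2L/R2R keep the high/low
// qword in place (masking the other half to zero), L2R/R2L move it across.
// The EE/EO/OE/OO aliases name the source and destination halves (even/odd row).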
#define ROW_L2L(x) _mm_and_si128(x, M128(_hiqmask))
#define ROW_L2R(x) _mm_srli_si128(x, 8)
#define ROW_R2L(x) _mm_slli_si128(x, 8)
#define ROW_R2R(x) _mm_and_si128(x, M128(_loqmask))
#define ROW_MOV_EO ROW_L2R
#define ROW_MOV_EE ROW_L2L
#define ROW_MOV_OE ROW_R2L
#define ROW_MOV_OO ROW_R2R
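// The same GF(2^8) multiplies for the 256-bit state, where two rows share one
// register: rm() is one of the ROW_MOV_xx macros, selecting a 64-bit row half
// and routing it to the destination half before the XOR. The bit planes live
// in p1..p4 and q1..q4.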
#define MUL_BITSLICE256_2(x, rm, i)\
x[7] = _mm_xor_si128(x[7], rm(p2[i]));\
x[6] = _mm_xor_si128(x[6], rm(p3[i]));\
x[5] = _mm_xor_si128(x[5], rm(p4[i]));\
x[4] = _mm_xor_si128(x[4], rm(q1[i]));\
x[4] = _mm_xor_si128(x[4], rm(p1[i]));\
x[3] = _mm_xor_si128(x[3], rm(q2[i]));\
x[3] = _mm_xor_si128(x[3], rm(p1[i]));\
x[2] = _mm_xor_si128(x[2], rm(q3[i]));\
x[1] = _mm_xor_si128(x[1], rm(q4[i]));\
x[1] = _mm_xor_si128(x[1], rm(p1[i]));\
x[0] = _mm_xor_si128(x[0], rm(p1[i]))
#define MUL_BITSLICE256_3(x, rm, i)\
x[7] = _mm_xor_si128(x[7], rm(p2[i]));\
x[7] = _mm_xor_si128(x[7], rm(p1[i]));\
x[6] = _mm_xor_si128(x[6], rm(p3[i]));\
x[6] = _mm_xor_si128(x[6], rm(p2[i]));\
x[5] = _mm_xor_si128(x[5], rm(p4[i]));\
x[5] = _mm_xor_si128(x[5], rm(p3[i]));\
x[4] = _mm_xor_si128(x[4], rm(q1[i]));\
x[4] = _mm_xor_si128(x[4], rm(p4[i]));\
x[4] = _mm_xor_si128(x[4], rm(p1[i]));\
x[3] = _mm_xor_si128(x[3], rm(q2[i]));\
x[3] = _mm_xor_si128(x[3], rm(q1[i]));\
x[3] = _mm_xor_si128(x[3], rm(p1[i]));\
x[2] = _mm_xor_si128(x[2], rm(q2[i]));\
x[2] = _mm_xor_si128(x[2], rm(q3[i]));\
x[1] = _mm_xor_si128(x[1], rm(q4[i]));\
x[1] = _mm_xor_si128(x[1], rm(q3[i]));\
x[1] = _mm_xor_si128(x[1], rm(p1[i]));\
x[0] = _mm_xor_si128(x[0], rm(q4[i]));\
x[0] = _mm_xor_si128(x[0], rm(p1[i]))
#define MUL_BITSLICE256_4(x, rm, i)\
x[7] = _mm_xor_si128(x[7], rm(p3[i]));\
x[6] = _mm_xor_si128(x[6], rm(p4[i]));\
x[5] = _mm_xor_si128(x[5], rm(q1[i]));\
x[5] = _mm_xor_si128(x[5], rm(p1[i]));\
x[4] = _mm_xor_si128(x[4], rm(q2[i]));\
x[4] = _mm_xor_si128(x[4], rm(p2[i]));\
x[4] = _mm_xor_si128(x[4], rm(p1[i]));\
x[3] = _mm_xor_si128(x[3], rm(q3[i]));\
x[3] = _mm_xor_si128(x[3], rm(p2[i]));\
x[2] = _mm_xor_si128(x[2], rm(q4[i]));\
x[2] = _mm_xor_si128(x[2], rm(p1[i]));\
x[1] = _mm_xor_si128(x[1], rm(p2[i]));\
x[1] = _mm_xor_si128(x[1], rm(p1[i]));\
x[0] = _mm_xor_si128(x[0], rm(p2[i]))
#define MUL_BITSLICE256_5(x, rm, i)\
x[7] = _mm_xor_si128(x[7], rm(p3[i]));\
x[7] = _mm_xor_si128(x[7], rm(p1[i]));\
x[6] = _mm_xor_si128(x[6], rm(p4[i]));\
x[6] = _mm_xor_si128(x[6], rm(p2[i]));\
x[5] = _mm_xor_si128(x[5], rm(q1[i]));\
x[5] = _mm_xor_si128(x[5], rm(p3[i]));\
x[5] = _mm_xor_si128(x[5], rm(p1[i]));\
x[4] = _mm_xor_si128(x[4], rm(q2[i]));\
x[4] = _mm_xor_si128(x[4], rm(p4[i]));\
x[4] = _mm_xor_si128(x[4], rm(p2[i]));\
x[4] = _mm_xor_si128(x[4], rm(p1[i]));\
x[3] = _mm_xor_si128(x[3], rm(q3[i]));\
x[3] = _mm_xor_si128(x[3], rm(q1[i]));\
x[3] = _mm_xor_si128(x[3], rm(p2[i]));\
x[2] = _mm_xor_si128(x[2], rm(q4[i]));\
x[2] = _mm_xor_si128(x[2], rm(q2[i]));\
x[2] = _mm_xor_si128(x[2], rm(p1[i]));\
x[1] = _mm_xor_si128(x[1], rm(q3[i]));\
x[1] = _mm_xor_si128(x[1], rm(p2[i]));\
x[1] = _mm_xor_si128(x[1], rm(p1[i]));\
x[0] = _mm_xor_si128(x[0], rm(q4[i]));\
x[0] = _mm_xor_si128(x[0], rm(p2[i]))
#define MUL_BITSLICE256_7(x, rm, i)\
x[7] = _mm_xor_si128(x[7], rm(p3[i]));\
x[7] = _mm_xor_si128(x[7], rm(p2[i]));\
x[7] = _mm_xor_si128(x[7], rm(p1[i]));\
x[6] = _mm_xor_si128(x[6], rm(p4[i]));\
x[6] = _mm_xor_si128(x[6], rm(p3[i]));\
x[6] = _mm_xor_si128(x[6], rm(p2[i]));\
x[5] = _mm_xor_si128(x[5], rm(q1[i]));\
x[5] = _mm_xor_si128(x[5], rm(p4[i]));\
x[5] = _mm_xor_si128(x[5], rm(p3[i]));\
x[5] = _mm_xor_si128(x[5], rm(p1[i]));\
x[4] = _mm_xor_si128(x[4], rm(q2[i]));\
x[4] = _mm_xor_si128(x[4], rm(q1[i]));\
x[4] = _mm_xor_si128(x[4], rm(p4[i]));\
x[4] = _mm_xor_si128(x[4], rm(p2[i]));\
x[3] = _mm_xor_si128(x[3], rm(q3[i]));\
x[3] = _mm_xor_si128(x[3], rm(q2[i]));\
x[3] = _mm_xor_si128(x[3], rm(q1[i]));\
x[3] = _mm_xor_si128(x[3], rm(p2[i]));\
x[3] = _mm_xor_si128(x[3], rm(p1[i]));\
x[2] = _mm_xor_si128(x[2], rm(q4[i]));\
x[2] = _mm_xor_si128(x[2], rm(q3[i]));\
x[2] = _mm_xor_si128(x[2], rm(q2[i]));\
x[2] = _mm_xor_si128(x[2], rm(p1[i]));\
x[1] = _mm_xor_si128(x[1], rm(q4[i]));\
x[1] = _mm_xor_si128(x[1], rm(q3[i]));\
x[1] = _mm_xor_si128(x[1], rm(p2[i]));\
x[0] = _mm_xor_si128(x[0], rm(q4[i]));\
x[0] = _mm_xor_si128(x[0], rm(p2[i]));\
x[0] = _mm_xor_si128(x[0], rm(p1[i]))
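// Compress 64-byte blocks for four independent message streams at once.
// P and Q are computed together: p* start as cv ^ message, q* as the message;
// ten rounds of AddRoundConstant, bitsliced SubBytes, ShiftBytes (pshufb) and
// MixBytes follow, and the feedforward is cv ^= P ^ Q.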
void Compress256(grssState *ctx,
const unsigned char *pmsg1, const unsigned char *pmsg2, const unsigned char *pmsg3, const unsigned char *pmsg4,
DataLength uBlockCount)
{
DataLength b;
unsigned int i, r;
__m128i x[8], t0, t1, t2, t3, t4, t5, t6, t7, u[4], u2[4];
__m128i p1[4], p2[4], p3[4], p4[4], q1[4], q2[4], q3[4], q4[4];
__m128i r1[8], r2[8], r3[8], r4[8], s1[8], s2[8], s3[8], s4[8];
__m128i x01[8], x23[8], x45[8], x67[8];
__m128i x0[8], x1[8], x2[8], x3[8], x4[8], x5[8], x6[8], x7[8];
for(i = 0; i < 8; i++)
x[i] = _mm_setzero_si128(); // clear scratch (x[] is not otherwise used in this function)
// transpose cv
TRANSPOSE(ctx->state1, u, u2);
TRANSPOSE(ctx->state2, u, u2);
TRANSPOSE(ctx->state3, u, u2);
TRANSPOSE(ctx->state4, u, u2);
for(b = 0; b < uBlockCount; b++)
{
q1[0] = _mm_loadu_si128((__m128i*)pmsg1 + 0);
q1[1] = _mm_loadu_si128((__m128i*)pmsg1 + 1);
q1[2] = _mm_loadu_si128((__m128i*)pmsg1 + 2);
q1[3] = _mm_loadu_si128((__m128i*)pmsg1 + 3);
q2[0] = _mm_loadu_si128((__m128i*)pmsg2 + 0);
q2[1] = _mm_loadu_si128((__m128i*)pmsg2 + 1);
q2[2] = _mm_loadu_si128((__m128i*)pmsg2 + 2);
q2[3] = _mm_loadu_si128((__m128i*)pmsg2 + 3);
q3[0] = _mm_loadu_si128((__m128i*)pmsg3 + 0);
q3[1] = _mm_loadu_si128((__m128i*)pmsg3 + 1);
q3[2] = _mm_loadu_si128((__m128i*)pmsg3 + 2);
q3[3] = _mm_loadu_si128((__m128i*)pmsg3 + 3);
q4[0] = _mm_loadu_si128((__m128i*)pmsg4 + 0);
q4[1] = _mm_loadu_si128((__m128i*)pmsg4 + 1);
q4[2] = _mm_loadu_si128((__m128i*)pmsg4 + 2);
q4[3] = _mm_loadu_si128((__m128i*)pmsg4 + 3);
// transpose message
TRANSPOSE(q1, u, u2);
TRANSPOSE(q2, u, u2);
TRANSPOSE(q3, u, u2);
TRANSPOSE(q4, u, u2);
// xor cv and message
for(i = 0; i < 4; i++)
{
p1[i] = _mm_xor_si128(ctx->state1[i], q1[i]);
p2[i] = _mm_xor_si128(ctx->state2[i], q2[i]);
p3[i] = _mm_xor_si128(ctx->state3[i], q3[i]);
p4[i] = _mm_xor_si128(ctx->state4[i], q4[i]);
}
BITSLICE(p1[0], p2[0], p3[0], p4[0], q1[0], q2[0], q3[0], q4[0], t0);
BITSLICE(p1[1], p2[1], p3[1], p4[1], q1[1], q2[1], q3[1], q4[1], t0);
BITSLICE(p1[2], p2[2], p3[2], p4[2], q1[2], q2[2], q3[2], q4[2], t0);
BITSLICE(p1[3], p2[3], p3[3], p4[3], q1[3], q2[3], q3[3], q4[3], t0);
for(r = 0; r < 10; r++)
{
// Add const
p1[0] = _mm_xor_si128(p1[0], ctx->_Pconst[r][0]);
p2[0] = _mm_xor_si128(p2[0], ctx->_Pconst[r][1]);
p3[0] = _mm_xor_si128(p3[0], ctx->_Pconst[r][2]);
p4[0] = _mm_xor_si128(p4[0], ctx->_Pconst[r][3]);
q1[0] = _mm_xor_si128(q1[0], ctx->_Pconst[r][4]);
q2[0] = _mm_xor_si128(q2[0], ctx->_Pconst[r][5]);
q3[0] = _mm_xor_si128(q3[0], ctx->_Pconst[r][6]);
q4[0] = _mm_xor_si128(q4[0], ctx->_Pconst[r][7]);
p1[3] = _mm_xor_si128(p1[3], ctx->_Qconst[r][0]);
p2[3] = _mm_xor_si128(p2[3], ctx->_Qconst[r][1]);
p3[3] = _mm_xor_si128(p3[3], ctx->_Qconst[r][2]);
p4[3] = _mm_xor_si128(p4[3], ctx->_Qconst[r][3]);
q1[3] = _mm_xor_si128(q1[3], ctx->_Qconst[r][4]);
q2[3] = _mm_xor_si128(q2[3], ctx->_Qconst[r][5]);
q3[3] = _mm_xor_si128(q3[3], ctx->_Qconst[r][6]);
q4[3] = _mm_xor_si128(q4[3], ctx->_Qconst[r][7]);
// Sub bytes
SUBSTITUTE_BITSLICE(q4[0], q3[0], q2[0], q1[0], p4[0], p3[0], p2[0], p1[0], t0, t1, t2, t3, t4, t5, t6, t7);
SUBSTITUTE_BITSLICE(q4[1], q3[1], q2[1], q1[1], p4[1], p3[1], p2[1], p1[1], t0, t1, t2, t3, t4, t5, t6, t7);
SUBSTITUTE_BITSLICE(q4[2], q3[2], q2[2], q1[2], p4[2], p3[2], p2[2], p1[2], t0, t1, t2, t3, t4, t5, t6, t7);
SUBSTITUTE_BITSLICE(q4[3], q3[3], q2[3], q1[3], p4[3], p3[3], p2[3], p1[3], t0, t1, t2, t3, t4, t5, t6, t7);
// Shift bytes
p1[0] = _mm_shuffle_epi8(p1[0], ctx->_shiftconst[0]);
p2[0] = _mm_shuffle_epi8(p2[0], ctx->_shiftconst[0]);
p3[0] = _mm_shuffle_epi8(p3[0], ctx->_shiftconst[0]);
p4[0] = _mm_shuffle_epi8(p4[0], ctx->_shiftconst[0]);
q1[0] = _mm_shuffle_epi8(q1[0], ctx->_shiftconst[0]);
q2[0] = _mm_shuffle_epi8(q2[0], ctx->_shiftconst[0]);
q3[0] = _mm_shuffle_epi8(q3[0], ctx->_shiftconst[0]);
q4[0] = _mm_shuffle_epi8(q4[0], ctx->_shiftconst[0]);
p1[1] = _mm_shuffle_epi8(p1[1], ctx->_shiftconst[1]);
p2[1] = _mm_shuffle_epi8(p2[1], ctx->_shiftconst[1]);
p3[1] = _mm_shuffle_epi8(p3[1], ctx->_shiftconst[1]);
p4[1] = _mm_shuffle_epi8(p4[1], ctx->_shiftconst[1]);
q1[1] = _mm_shuffle_epi8(q1[1], ctx->_shiftconst[1]);
q2[1] = _mm_shuffle_epi8(q2[1], ctx->_shiftconst[1]);
q3[1] = _mm_shuffle_epi8(q3[1], ctx->_shiftconst[1]);
q4[1] = _mm_shuffle_epi8(q4[1], ctx->_shiftconst[1]);
p1[2] = _mm_shuffle_epi8(p1[2], ctx->_shiftconst[2]);
p2[2] = _mm_shuffle_epi8(p2[2], ctx->_shiftconst[2]);
p3[2] = _mm_shuffle_epi8(p3[2], ctx->_shiftconst[2]);
p4[2] = _mm_shuffle_epi8(p4[2], ctx->_shiftconst[2]);
q1[2] = _mm_shuffle_epi8(q1[2], ctx->_shiftconst[2]);
q2[2] = _mm_shuffle_epi8(q2[2], ctx->_shiftconst[2]);
q3[2] = _mm_shuffle_epi8(q3[2], ctx->_shiftconst[2]);
q4[2] = _mm_shuffle_epi8(q4[2], ctx->_shiftconst[2]);
p1[3] = _mm_shuffle_epi8(p1[3], ctx->_shiftconst[3]);
p2[3] = _mm_shuffle_epi8(p2[3], ctx->_shiftconst[3]);
p3[3] = _mm_shuffle_epi8(p3[3], ctx->_shiftconst[3]);
p4[3] = _mm_shuffle_epi8(p4[3], ctx->_shiftconst[3]);
q1[3] = _mm_shuffle_epi8(q1[3], ctx->_shiftconst[3]);
q2[3] = _mm_shuffle_epi8(q2[3], ctx->_shiftconst[3]);
q3[3] = _mm_shuffle_epi8(q3[3], ctx->_shiftconst[3]);
q4[3] = _mm_shuffle_epi8(q4[3], ctx->_shiftconst[3]);
// Mix bytes
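// Two MixBytes variants follow: the disabled branch splits every row into its
// own register; the active branch below (#else) keeps two rows per register
// and uses the ROW_MOV half-register moves.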
#if 0
for(i = 0; i < 4; i++)
{
r1[2 * i + 0] = _mm_srli_si128(p1[i], 8);
r1[2 * i + 1] = _mm_and_si128(p1[i], M128(_loqmask));
r2[2 * i + 0] = _mm_srli_si128(p2[i], 8);
r2[2 * i + 1] = _mm_and_si128(p2[i], M128(_loqmask));
r3[2 * i + 0] = _mm_srli_si128(p3[i], 8);
r3[2 * i + 1] = _mm_and_si128(p3[i], M128(_loqmask));
r4[2 * i + 0] = _mm_srli_si128(p4[i], 8);
r4[2 * i + 1] = _mm_and_si128(p4[i], M128(_loqmask));
s1[2 * i + 0] = _mm_srli_si128(q1[i], 8);
s1[2 * i + 1] = _mm_and_si128(q1[i], M128(_loqmask));
s2[2 * i + 0] = _mm_srli_si128(q2[i], 8);
s2[2 * i + 1] = _mm_and_si128(q2[i], M128(_loqmask));
s3[2 * i + 0] = _mm_srli_si128(q3[i], 8);
s3[2 * i + 1] = _mm_and_si128(q3[i], M128(_loqmask));
s4[2 * i + 0] = _mm_srli_si128(q4[i], 8);
s4[2 * i + 1] = _mm_and_si128(q4[i], M128(_loqmask));
}
for(i = 0; i < 8; i++)
{
x0[i] = _mm_setzero_si128();
x1[i] = _mm_setzero_si128();
x2[i] = _mm_setzero_si128();
x3[i] = _mm_setzero_si128();
x4[i] = _mm_setzero_si128();
x5[i] = _mm_setzero_si128();
x6[i] = _mm_setzero_si128();
x7[i] = _mm_setzero_si128();
}
MUL_BITSLICE_2(x0, 0, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x0, 1, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x0, 2, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_4(x0, 3, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x0, 4, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x0, 5, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x0, 6, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_7(x0, 7, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x1, 1, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x1, 2, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x1, 3, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_4(x1, 4, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x1, 5, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x1, 6, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x1, 7, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_7(x1, 0, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x2, 2, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x2, 3, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x2, 4, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_4(x2, 5, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x2, 6, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x2, 7, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x2, 0, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_7(x2, 1, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x3, 3, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x3, 4, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x3, 5, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_4(x3, 6, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x3, 7, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x3, 0, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x3, 1, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_7(x3, 2, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x4, 4, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x4, 5, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x4, 6, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_4(x4, 7, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x4, 0, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x4, 1, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x4, 2, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_7(x4, 3, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x5, 5, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x5, 6, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x5, 7, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_4(x5, 0, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x5, 1, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x5, 2, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x5, 3, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_7(x5, 4, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x6, 6, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x6, 7, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x6, 0, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_4(x6, 1, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x6, 2, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x6, 3, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x6, 4, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_7(x6, 5, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x7, 7, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_2(x7, 0, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x7, 1, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_4(x7, 2, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x7, 3, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_3(x7, 4, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_5(x7, 5, r1, r2, r3, r4, s1, s2, s3, s4);
MUL_BITSLICE_7(x7, 6, r1, r2, r3, r4, s1, s2, s3, s4);
p1[0] = _mm_unpacklo_epi64(x1[7], x0[7]);
p2[0] = _mm_unpacklo_epi64(x1[6], x0[6]);
p3[0] = _mm_unpacklo_epi64(x1[5], x0[5]);
p4[0] = _mm_unpacklo_epi64(x1[4], x0[4]);
q1[0] = _mm_unpacklo_epi64(x1[3], x0[3]);
q2[0] = _mm_unpacklo_epi64(x1[2], x0[2]);
q3[0] = _mm_unpacklo_epi64(x1[1], x0[1]);
q4[0] = _mm_unpacklo_epi64(x1[0], x0[0]);
p1[1] = _mm_unpacklo_epi64(x3[7], x2[7]);
p2[1] = _mm_unpacklo_epi64(x3[6], x2[6]);
p3[1] = _mm_unpacklo_epi64(x3[5], x2[5]);
p4[1] = _mm_unpacklo_epi64(x3[4], x2[4]);
q1[1] = _mm_unpacklo_epi64(x3[3], x2[3]);
q2[1] = _mm_unpacklo_epi64(x3[2], x2[2]);
q3[1] = _mm_unpacklo_epi64(x3[1], x2[1]);
q4[1] = _mm_unpacklo_epi64(x3[0], x2[0]);
p1[2] = _mm_unpacklo_epi64(x5[7], x4[7]);
p2[2] = _mm_unpacklo_epi64(x5[6], x4[6]);
p3[2] = _mm_unpacklo_epi64(x5[5], x4[5]);
p4[2] = _mm_unpacklo_epi64(x5[4], x4[4]);
q1[2] = _mm_unpacklo_epi64(x5[3], x4[3]);
q2[2] = _mm_unpacklo_epi64(x5[2], x4[2]);
q3[2] = _mm_unpacklo_epi64(x5[1], x4[1]);
q4[2] = _mm_unpacklo_epi64(x5[0], x4[0]);
p1[3] = _mm_unpacklo_epi64(x7[7], x6[7]);
p2[3] = _mm_unpacklo_epi64(x7[6], x6[6]);
p3[3] = _mm_unpacklo_epi64(x7[5], x6[5]);
p4[3] = _mm_unpacklo_epi64(x7[4], x6[4]);
q1[3] = _mm_unpacklo_epi64(x7[3], x6[3]);
q2[3] = _mm_unpacklo_epi64(x7[2], x6[2]);
q3[3] = _mm_unpacklo_epi64(x7[1], x6[1]);
q4[3] = _mm_unpacklo_epi64(x7[0], x6[0]);
#else
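// Active variant: accumulate the MixBytes output for row pairs 0/1, 2/3, 4/5
// and 6/7 directly in x01/x23/x45/x67.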
for(i = 0; i < 8; i++)
{
x01[i] = _mm_setzero_si128();
x23[i] = _mm_setzero_si128();
x45[i] = _mm_setzero_si128();
x67[i] = _mm_setzero_si128();
}
// row 1
MUL_BITSLICE256_2(x01, ROW_MOV_EE, 0);
MUL_BITSLICE256_2(x01, ROW_MOV_OE, 0);
MUL_BITSLICE256_3(x01, ROW_MOV_EE, 1);
MUL_BITSLICE256_4(x01, ROW_MOV_OE, 1);
MUL_BITSLICE256_5(x01, ROW_MOV_EE, 2);
MUL_BITSLICE256_3(x01, ROW_MOV_OE, 2);
MUL_BITSLICE256_5(x01, ROW_MOV_EE, 3);
MUL_BITSLICE256_7(x01, ROW_MOV_OE, 3);
// row 2
MUL_BITSLICE256_7(x01, ROW_MOV_EO, 0);
MUL_BITSLICE256_2(x01, ROW_MOV_OO, 0);
MUL_BITSLICE256_2(x01, ROW_MOV_EO, 1);
MUL_BITSLICE256_3(x01, ROW_MOV_OO, 1);
MUL_BITSLICE256_4(x01, ROW_MOV_EO, 2);
MUL_BITSLICE256_5(x01, ROW_MOV_OO, 2);
MUL_BITSLICE256_3(x01, ROW_MOV_EO, 3);
MUL_BITSLICE256_5(x01, ROW_MOV_OO, 3);
// row 3
MUL_BITSLICE256_5(x23, ROW_MOV_EE, 0);
MUL_BITSLICE256_7(x23, ROW_MOV_OE, 0);
MUL_BITSLICE256_2(x23, ROW_MOV_EE, 1);
MUL_BITSLICE256_2(x23, ROW_MOV_OE, 1);
MUL_BITSLICE256_3(x23, ROW_MOV_EE, 2);
MUL_BITSLICE256_4(x23, ROW_MOV_OE, 2);
MUL_BITSLICE256_5(x23, ROW_MOV_EE, 3);
MUL_BITSLICE256_3(x23, ROW_MOV_OE, 3);
// row 4
MUL_BITSLICE256_3(x23, ROW_MOV_EO, 0);
MUL_BITSLICE256_5(x23, ROW_MOV_OO, 0);
MUL_BITSLICE256_7(x23, ROW_MOV_EO, 1);
MUL_BITSLICE256_2(x23, ROW_MOV_OO, 1);
MUL_BITSLICE256_2(x23, ROW_MOV_EO, 2);
MUL_BITSLICE256_3(x23, ROW_MOV_OO, 2);
MUL_BITSLICE256_4(x23, ROW_MOV_EO, 3);
MUL_BITSLICE256_5(x23, ROW_MOV_OO, 3);
// row 5
MUL_BITSLICE256_5(x45, ROW_MOV_EE, 0);
MUL_BITSLICE256_3(x45, ROW_MOV_OE, 0);
MUL_BITSLICE256_5(x45, ROW_MOV_EE, 1);
MUL_BITSLICE256_7(x45, ROW_MOV_OE, 1);
MUL_BITSLICE256_2(x45, ROW_MOV_EE, 2);
MUL_BITSLICE256_2(x45, ROW_MOV_OE, 2);
MUL_BITSLICE256_3(x45, ROW_MOV_EE, 3);
MUL_BITSLICE256_4(x45, ROW_MOV_OE, 3);
// row 6
MUL_BITSLICE256_4(x45, ROW_MOV_EO, 0);
MUL_BITSLICE256_5(x45, ROW_MOV_OO, 0);
MUL_BITSLICE256_3(x45, ROW_MOV_EO, 1);
MUL_BITSLICE256_5(x45, ROW_MOV_OO, 1);
MUL_BITSLICE256_7(x45, ROW_MOV_EO, 2);
MUL_BITSLICE256_2(x45, ROW_MOV_OO, 2);
MUL_BITSLICE256_2(x45, ROW_MOV_EO, 3);
MUL_BITSLICE256_3(x45, ROW_MOV_OO, 3);
// row 7
MUL_BITSLICE256_3(x67, ROW_MOV_EE, 0);
MUL_BITSLICE256_4(x67, ROW_MOV_OE, 0);
MUL_BITSLICE256_5(x67, ROW_MOV_EE, 1);
MUL_BITSLICE256_3(x67, ROW_MOV_OE, 1);
MUL_BITSLICE256_5(x67, ROW_MOV_EE, 2);
MUL_BITSLICE256_7(x67, ROW_MOV_OE, 2);
MUL_BITSLICE256_2(x67, ROW_MOV_EE, 3);
MUL_BITSLICE256_2(x67, ROW_MOV_OE, 3);
// row 8
MUL_BITSLICE256_2(x67, ROW_MOV_EO, 0);
MUL_BITSLICE256_3(x67, ROW_MOV_OO, 0);
MUL_BITSLICE256_4(x67, ROW_MOV_EO, 1);
MUL_BITSLICE256_5(x67, ROW_MOV_OO, 1);
MUL_BITSLICE256_3(x67, ROW_MOV_EO, 2);
MUL_BITSLICE256_5(x67, ROW_MOV_OO, 2);
MUL_BITSLICE256_7(x67, ROW_MOV_EO, 3);
MUL_BITSLICE256_2(x67, ROW_MOV_OO, 3);
p1[0] = x01[7];
p2[0] = x01[6];
p3[0] = x01[5];
p4[0] = x01[4];
q1[0] = x01[3];
q2[0] = x01[2];
q3[0] = x01[1];
q4[0] = x01[0];
p1[1] = x23[7];
p2[1] = x23[6];
p3[1] = x23[5];
p4[1] = x23[4];
q1[1] = x23[3];
q2[1] = x23[2];
q3[1] = x23[1];
q4[1] = x23[0];
p1[2] = x45[7];
p2[2] = x45[6];
p3[2] = x45[5];
p4[2] = x45[4];
q1[2] = x45[3];
q2[2] = x45[2];
q3[2] = x45[1];
q4[2] = x45[0];
p1[3] = x67[7];
p2[3] = x67[6];
p3[3] = x67[5];
p4[3] = x67[4];
q1[3] = x67[3];
q2[3] = x67[2];
q3[3] = x67[1];
q4[3] = x67[0];
#endif
}
BITSLICE(p1[0], p2[0], p3[0], p4[0], q1[0], q2[0], q3[0], q4[0], t0);
BITSLICE(p1[1], p2[1], p3[1], p4[1], q1[1], q2[1], q3[1], q4[1], t0);
BITSLICE(p1[2], p2[2], p3[2], p4[2], q1[2], q2[2], q3[2], q4[2], t0);
BITSLICE(p1[3], p2[3], p3[3], p4[3], q1[3], q2[3], q3[3], q4[3], t0);
// P ^ Q
for(i = 0; i < 4; i++)
{
ctx->state1[i] = _mm_xor_si128(ctx->state1[i], _mm_xor_si128(p1[i], q1[i]));
ctx->state2[i] = _mm_xor_si128(ctx->state2[i], _mm_xor_si128(p2[i], q2[i]));
ctx->state3[i] = _mm_xor_si128(ctx->state3[i], _mm_xor_si128(p3[i], q3[i]));
ctx->state4[i] = _mm_xor_si128(ctx->state4[i], _mm_xor_si128(p4[i], q4[i]));
}
pmsg1 += 64;
pmsg2 += 64;
pmsg3 += 64;
pmsg4 += 64;
}
// transpose state back
TRANSPOSE_BACK(ctx->state1, u, u2);
TRANSPOSE_BACK(ctx->state2, u, u2);
TRANSPOSE_BACK(ctx->state3, u, u2);
TRANSPOSE_BACK(ctx->state4, u, u2);
}
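// Transpose the 128-byte Grøstl-512 state held in m[0..7] into the row layout
// used by the round code; u[] and v[] are scratch.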
#define TRANSPOSE512(m, u, v)\
u[0] = _mm_shuffle_epi8(m[0], M128(_transpose1));\
u[1] = _mm_shuffle_epi8(m[1], M128(_transpose1));\
u[2] = _mm_shuffle_epi8(m[2], M128(_transpose1));\
u[3] = _mm_shuffle_epi8(m[3], M128(_transpose1));\
u[4] = _mm_shuffle_epi8(m[4], M128(_transpose1));\
u[5] = _mm_shuffle_epi8(m[5], M128(_transpose1));\
u[6] = _mm_shuffle_epi8(m[6], M128(_transpose1));\
u[7] = _mm_shuffle_epi8(m[7], M128(_transpose1));\
v[0] = _mm_unpacklo_epi16(u[7], u[6]);\
v[1] = _mm_unpacklo_epi16(u[5], u[4]);\
v[2] = _mm_unpacklo_epi16(u[3], u[2]);\
v[3] = _mm_unpacklo_epi16(u[1], u[0]);\
v[4] = _mm_unpackhi_epi16(u[7], u[6]);\
v[5] = _mm_unpackhi_epi16(u[5], u[4]);\
v[6] = _mm_unpackhi_epi16(u[3], u[2]);\
v[7] = _mm_unpackhi_epi16(u[1], u[0]);\
u[0] = _mm_unpackhi_epi32(v[6], v[7]);\
u[1] = _mm_unpacklo_epi32(v[6], v[7]);\
u[2] = _mm_unpackhi_epi32(v[4], v[5]);\
u[3] = _mm_unpacklo_epi32(v[4], v[5]);\
u[4] = _mm_unpackhi_epi32(v[2], v[3]);\
u[5] = _mm_unpacklo_epi32(v[2], v[3]);\
u[6] = _mm_unpackhi_epi32(v[0], v[1]);\
u[7] = _mm_unpacklo_epi32(v[0], v[1]);\
m[0] = _mm_unpackhi_epi64(u[2], u[0]);\
m[1] = _mm_unpacklo_epi64(u[2], u[0]);\
m[2] = _mm_unpackhi_epi64(u[3], u[1]);\
m[3] = _mm_unpacklo_epi64(u[3], u[1]);\
m[4] = _mm_unpackhi_epi64(u[6], u[4]);\
m[5] = _mm_unpacklo_epi64(u[6], u[4]);\
m[6] = _mm_unpackhi_epi64(u[7], u[5]);\
m[7] = _mm_unpacklo_epi64(u[7], u[5])
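// Inverse transpose for the 512-bit state (note it uses _invmask rather than
// _transpose1, so it is not simply TRANSPOSE512 run twice).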
#define TRANSPOSE512_BACK(m, u, v)\
u[0] = _mm_shuffle_epi8(m[0], M128(_invmask));\
u[1] = _mm_shuffle_epi8(m[1], M128(_invmask));\
u[2] = _mm_shuffle_epi8(m[2], M128(_invmask));\
u[3] = _mm_shuffle_epi8(m[3], M128(_invmask));\
u[4] = _mm_shuffle_epi8(m[4], M128(_invmask));\
u[5] = _mm_shuffle_epi8(m[5], M128(_invmask));\
u[6] = _mm_shuffle_epi8(m[6], M128(_invmask));\
u[7] = _mm_shuffle_epi8(m[7], M128(_invmask));\
v[0] = _mm_unpacklo_epi8(u[0], u[1]);\
v[1] = _mm_unpacklo_epi8(u[2], u[3]);\
v[2] = _mm_unpacklo_epi8(u[4], u[5]);\
v[3] = _mm_unpacklo_epi8(u[6], u[7]);\
v[4] = _mm_unpackhi_epi8(u[0], u[1]);\
v[5] = _mm_unpackhi_epi8(u[2], u[3]);\
v[6] = _mm_unpackhi_epi8(u[4], u[5]);\
v[7] = _mm_unpackhi_epi8(u[6], u[7]);\
u[0] = _mm_unpacklo_epi16(v[0], v[1]);\
u[1] = _mm_unpacklo_epi16(v[2], v[3]);\
u[2] = _mm_unpacklo_epi16(v[4], v[5]);\
u[3] = _mm_unpacklo_epi16(v[6], v[7]);\
u[4] = _mm_unpackhi_epi16(v[0], v[1]);\
u[5] = _mm_unpackhi_epi16(v[2], v[3]);\
u[6] = _mm_unpackhi_epi16(v[4], v[5]);\
u[7] = _mm_unpackhi_epi16(v[6], v[7]);\
m[0] = _mm_unpacklo_epi32(u[0], u[1]);\
m[1] = _mm_unpackhi_epi32(u[0], u[1]);\
m[2] = _mm_unpacklo_epi32(u[4], u[5]);\
m[3] = _mm_unpackhi_epi32(u[4], u[5]);\
m[4] = _mm_unpacklo_epi32(u[2], u[3]);\
m[5] = _mm_unpackhi_epi32(u[2], u[3]);\
m[6] = _mm_unpacklo_epi32(u[6], u[7]);\
m[7] = _mm_unpackhi_epi32(u[6], u[7])
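// Grøstl-512 compression for four streams: 128-byte blocks, eight registers
// per stream, 14 rounds; otherwise the same P/Q construction and feedforward
// as Compress256.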
void Compress512(grssState *ctx,
const unsigned char *pmsg1, const unsigned char *pmsg2, const unsigned char *pmsg3, const unsigned char *pmsg4,
DataLength uBlockCount)
{
__m128i u[8], v[8], p1[8], p2[8], p3[8], p4[8], q1[8], q2[8], q3[8], q4[8], t;
__m128i t0, t1, t2, t3, s0, s1, s2, s3;
__m128i x0[8], x1[8], x2[8], x3[8], x4[8], x5[8], x6[8], x7[8];
DataLength b;
unsigned int i, r;
// transpose cv
TRANSPOSE512(ctx->state1, u, v);
TRANSPOSE512(ctx->state2, u, v);
TRANSPOSE512(ctx->state3, u, v);
TRANSPOSE512(ctx->state4, u, v);
for(b = 0; b < uBlockCount; b++)
{
// load message
for(i = 0; i < 8; i++)
{
q1[i] = _mm_loadu_si128((__m128i*)pmsg1 + i);
q2[i] = _mm_loadu_si128((__m128i*)pmsg2 + i);
q3[i] = _mm_loadu_si128((__m128i*)pmsg3 + i);
q4[i] = _mm_loadu_si128((__m128i*)pmsg4 + i);
}
// transpose message
TRANSPOSE512(q1, u, v);
TRANSPOSE512(q2, u, v);
TRANSPOSE512(q3, u, v);
TRANSPOSE512(q4, u, v);
// xor cv and message
for(i = 0; i < 8; i++)
{
p1[i] = _mm_xor_si128(ctx->state1[i], q1[i]);
p2[i] = _mm_xor_si128(ctx->state2[i], q2[i]);
p3[i] = _mm_xor_si128(ctx->state3[i], q3[i]);
p4[i] = _mm_xor_si128(ctx->state4[i], q4[i]);
}
for(i = 0; i < 8; i++)
{
BITSLICE(p1[i], p2[i], p3[i], p4[i], q1[i], q2[i], q3[i], q4[i], t);
}
for(r = 0; r < 14; r++)
{
// add constant
p1[0] = _mm_xor_si128(p1[0], ctx->_Pconst[r][0]);
p2[0] = _mm_xor_si128(p2[0], ctx->_Pconst[r][1]);
p3[0] = _mm_xor_si128(p3[0], ctx->_Pconst[r][2]);
p4[0] = _mm_xor_si128(p4[0], ctx->_Pconst[r][3]);
q1[0] = _mm_xor_si128(q1[0], ctx->_Pconst[r][4]);
q2[0] = _mm_xor_si128(q2[0], ctx->_Pconst[r][5]);
q3[0] = _mm_xor_si128(q3[0], ctx->_Pconst[r][6]);
q4[0] = _mm_xor_si128(q4[0], ctx->_Pconst[r][7]);
p1[7] = _mm_xor_si128(p1[7], ctx->_Qconst[r][0]);
p2[7] = _mm_xor_si128(p2[7], ctx->_Qconst[r][1]);
p3[7] = _mm_xor_si128(p3[7], ctx->_Qconst[r][2]);
p4[7] = _mm_xor_si128(p4[7], ctx->_Qconst[r][3]);
q1[7] = _mm_xor_si128(q1[7], ctx->_Qconst[r][4]);
q2[7] = _mm_xor_si128(q2[7], ctx->_Qconst[r][5]);
q3[7] = _mm_xor_si128(q3[7], ctx->_Qconst[r][6]);
q4[7] = _mm_xor_si128(q4[7], ctx->_Qconst[r][7]);
// sub bytes
for(i = 0; i < 8; i++)
{
SUBSTITUTE_BITSLICE(q4[i], q3[i], q2[i], q1[i], p4[i], p3[i], p2[i], p1[i], t0, t1, t2, t3, s0, s1, s2, s3);
}
// shift bytes
for(i = 1; i < 8; i++)
{
p1[i] = _mm_shuffle_epi8(p1[i], ctx->_shiftconst[i]);
p2[i] = _mm_shuffle_epi8(p2[i], ctx->_shiftconst[i]);
p3[i] = _mm_shuffle_epi8(p3[i], ctx->_shiftconst[i]);
p4[i] = _mm_shuffle_epi8(p4[i], ctx->_shiftconst[i]);
q1[i] = _mm_shuffle_epi8(q1[i], ctx->_shiftconst[i]);
q2[i] = _mm_shuffle_epi8(q2[i], ctx->_shiftconst[i]);
q3[i] = _mm_shuffle_epi8(q3[i], ctx->_shiftconst[i]);
q4[i] = _mm_shuffle_epi8(q4[i], ctx->_shiftconst[i]);
}
// mix bytes
for(i = 0; i < 8; i++)
{
x0[i] = _mm_setzero_si128();
x1[i] = _mm_setzero_si128();
x2[i] = _mm_setzero_si128();
x3[i] = _mm_setzero_si128();
x4[i] = _mm_setzero_si128();
x5[i] = _mm_setzero_si128();
x6[i] = _mm_setzero_si128();
x7[i] = _mm_setzero_si128();
}
MUL_BITSLICE_2(x0, 0, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x0, 1, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x0, 2, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_4(x0, 3, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x0, 4, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x0, 5, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x0, 6, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_7(x0, 7, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x1, 1, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x1, 2, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x1, 3, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_4(x1, 4, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x1, 5, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x1, 6, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x1, 7, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_7(x1, 0, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x2, 2, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x2, 3, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x2, 4, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_4(x2, 5, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x2, 6, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x2, 7, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x2, 0, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_7(x2, 1, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x3, 3, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x3, 4, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x3, 5, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_4(x3, 6, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x3, 7, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x3, 0, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x3, 1, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_7(x3, 2, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x4, 4, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x4, 5, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x4, 6, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_4(x4, 7, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x4, 0, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x4, 1, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x4, 2, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_7(x4, 3, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x5, 5, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x5, 6, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x5, 7, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_4(x5, 0, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x5, 1, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x5, 2, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x5, 3, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_7(x5, 4, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x6, 6, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x6, 7, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x6, 0, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_4(x6, 1, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x6, 2, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x6, 3, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x6, 4, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_7(x6, 5, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x7, 7, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_2(x7, 0, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x7, 1, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_4(x7, 2, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x7, 3, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_3(x7, 4, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_5(x7, 5, p1, p2, p3, p4, q1, q2, q3, q4);
MUL_BITSLICE_7(x7, 6, p1, p2, p3, p4, q1, q2, q3, q4);
p1[0] = x0[7];
p2[0] = x0[6];
p3[0] = x0[5];
p4[0] = x0[4];
q1[0] = x0[3];
q2[0] = x0[2];
q3[0] = x0[1];
q4[0] = x0[0];
p1[1] = x1[7];
p2[1] = x1[6];
p3[1] = x1[5];
p4[1] = x1[4];
q1[1] = x1[3];
q2[1] = x1[2];
q3[1] = x1[1];
q4[1] = x1[0];
p1[2] = x2[7];
p2[2] = x2[6];
p3[2] = x2[5];
p4[2] = x2[4];
q1[2] = x2[3];
q2[2] = x2[2];
q3[2] = x2[1];
q4[2] = x2[0];
p1[3] = x3[7];
p2[3] = x3[6];
p3[3] = x3[5];
p4[3] = x3[4];
q1[3] = x3[3];
q2[3] = x3[2];
q3[3] = x3[1];
q4[3] = x3[0];
p1[4] = x4[7];
p2[4] = x4[6];
p3[4] = x4[5];
p4[4] = x4[4];
q1[4] = x4[3];
q2[4] = x4[2];
q3[4] = x4[1];
q4[4] = x4[0];
p1[5] = x5[7];
p2[5] = x5[6];
p3[5] = x5[5];
p4[5] = x5[4];
q1[5] = x5[3];
q2[5] = x5[2];
q3[5] = x5[1];
q4[5] = x5[0];
p1[6] = x6[7];
p2[6] = x6[6];
p3[6] = x6[5];
p4[6] = x6[4];
q1[6] = x6[3];
q2[6] = x6[2];
q3[6] = x6[1];
q4[6] = x6[0];
p1[7] = x7[7];
p2[7] = x7[6];
p3[7] = x7[5];
p4[7] = x7[4];
q1[7] = x7[3];
q2[7] = x7[2];
q3[7] = x7[1];
q4[7] = x7[0];
}
for(i = 0; i < 8; i++)
{
BITSLICE(p1[i], p2[i], p3[i], p4[i], q1[i], q2[i], q3[i], q4[i], t);
}
for(i = 0; i < 8; i++)
{
ctx->state1[i] = _mm_xor_si128(ctx->state1[i], _mm_xor_si128(p1[i], q1[i]));
ctx->state2[i] = _mm_xor_si128(ctx->state2[i], _mm_xor_si128(p2[i], q2[i]));
ctx->state3[i] = _mm_xor_si128(ctx->state3[i], _mm_xor_si128(p3[i], q3[i]));
ctx->state4[i] = _mm_xor_si128(ctx->state4[i], _mm_xor_si128(p4[i], q4[i]));
}
// advance each stream to its next 128-byte block (mirrors the 64-byte
// advance in Compress256; without this the loop would reread block 0
// whenever uBlockCount > 1)
pmsg1 += 128;
pmsg2 += 128;
pmsg3 += 128;
pmsg4 += 128;
}
TRANSPOSE512_BACK(ctx->state1, u, v);
TRANSPOSE512_BACK(ctx->state2, u, v);
TRANSPOSE512_BACK(ctx->state3, u, v);
TRANSPOSE512_BACK(ctx->state4, u, v);
}
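// NIST-API style init; only 256- and 512-bit digests are supported.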
void grssInit(grssState *pctx, int grssbitlen)
{
pctx->uHashLength = grssbitlen;
switch(grssbitlen)
{
case 256:
pctx->uBlockLength = 64;
Init256(pctx);
break;
case 512:
pctx->uBlockLength = 128;
Init512(pctx);
break;
}
}
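// Feed whole blocks only (no buffering or padding here): the input is split
// into four equal-length quarters, one per stream.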
void grssUpdate(grssState *state, const BitSequence *data, DataLength databitlen)
{
DataLength uByteLength, uBlockCount;
uByteLength = databitlen / 8;
uBlockCount = uByteLength / state->uBlockLength;
if(state->uHashLength == 256)
{
Compress256(state,
data + 0 * (uBlockCount / 4) * state->uBlockLength,
data + 1 * (uBlockCount / 4) * state->uBlockLength,
data + 2 * (uBlockCount / 4) * state->uBlockLength,
data + 3 * (uBlockCount / 4) * state->uBlockLength,
uBlockCount / 4);
}
else
{
Compress512(state,
data + 0 * (uBlockCount / 4) * state->uBlockLength,
data + 1 * (uBlockCount / 4) * state->uBlockLength,
data + 2 * (uBlockCount / 4) * state->uBlockLength,
data + 3 * (uBlockCount / 4) * state->uBlockLength,
1); // NOTE: hardcoded to one block per stream; the general form would be uBlockCount / 4
}
}
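// Emit the digest of stream 1: the raw chaining value is stored directly, with
// no separate output transformation applied here.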
void grssFinal(grssState *state, BitSequence *grssval)
{
if(state->uHashLength == 256)
{
_mm_storeu_si128((__m128i*)grssval + 0, state->state1[0]);
_mm_storeu_si128((__m128i*)grssval + 1, state->state1[1]);
}
else
{
_mm_storeu_si128((__m128i*)grssval + 0, state->state1[0]);
_mm_storeu_si128((__m128i*)grssval + 1, state->state1[1]);
_mm_storeu_si128((__m128i*)grssval + 2, state->state1[2]);
_mm_storeu_si128((__m128i*)grssval + 3, state->state1[3]);
}
}
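// One-shot convenience wrapper around init/update/final.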
void Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
grssState hs;
grssInit(&hs, hashbitlen);
grssUpdate(&hs, data, databitlen);
grssFinal(&hs, hashval);
}
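/*
 * Illustrative one-shot usage (a sketch; `msg` and `msgbits` are placeholders,
 * and the buffer sizes follow the store widths in grssFinal):
 *
 *     BitSequence digest[32];
 *     Hash(256, msg, msgbits, digest);
 *
 * For 512-bit output, use a 64-byte buffer and pass 512 as hashbitlen.
 */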