This commit is contained in:
Jay D Dee
2021-07-26 15:01:37 -04:00
parent 92b3733925
commit 9b905fccc8
33 changed files with 889 additions and 565 deletions

View File

@@ -104,11 +104,9 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm512_xor_si512(j, j);\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask( m512_zero, i) );\
i = _mm512_add_epi8(i, i);\
j = _mm512_and_si512(j, k);\
i = _mm512_xor_si512(i, j);\
i = mm512_xorand( i, j, k );\
}
/**/
@@ -130,100 +128,90 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
#define MixBytes( a0, a1, a2, a3, a4, a5, a6, a7, \
b0, b1, b2, b3, b4, b5, b6, b7) { \
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm512_xor_si512(a0, a1);\
b0 = a2;\
a1 = _mm512_xor_si512(a1, a2);\
b1 = a3;\
a2 = _mm512_xor_si512(a2, a3);\
b2 = a4;\
a3 = _mm512_xor_si512(a3, a4);\
b3 = a5;\
a4 = _mm512_xor_si512(a4, a5);\
b4 = a6;\
a5 = _mm512_xor_si512(a5, a6);\
b5 = a7;\
a6 = _mm512_xor_si512(a6, a7);\
a7 = _mm512_xor_si512(a7, b6);\
b6 = a0; \
b7 = a1; \
a0 = _mm512_xor_si512( a0, a1 ); \
b0 = a2; \
a1 = _mm512_xor_si512( a1, a2 ); \
b1 = a3; \
TEMP2 = _mm512_xor_si512( a2, a3 ); \
b2 = a4; \
a3 = _mm512_xor_si512( a3, a4 ); \
b3 = a5; \
a4 = _mm512_xor_si512( a4, a5 );\
b4 = a6; \
a5 = _mm512_xor_si512( a5, a6 ); \
b5 = a7; \
a6 = _mm512_xor_si512( a6, a7 ); \
a7 = _mm512_xor_si512( a7, b6 ); \
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm512_xor_si512(b0, a4);\
b6 = _mm512_xor_si512(b6, a4);\
b1 = _mm512_xor_si512(b1, a5);\
b7 = _mm512_xor_si512(b7, a5);\
b2 = _mm512_xor_si512(b2, a6);\
b0 = _mm512_xor_si512(b0, a6);\
TEMP0 = mm512_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm512_xor_si512(b3, a7);\
b1 = _mm512_xor_si512(b1, a7);\
TEMP1 = b1;\
b4 = _mm512_xor_si512(b4, a0);\
b2 = _mm512_xor_si512(b2, a0);\
TEMP1 = mm512_xor3( b1, a5, a7 ); \
b2 = mm512_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm512_xor_si512(b5, a1);\
b3 = _mm512_xor_si512(b3, a1);\
b1 = a1;\
b6 = _mm512_xor_si512(b6, a2);\
b4 = _mm512_xor_si512(b4, a2);\
TEMP2 = a2;\
b7 = _mm512_xor_si512(b7, a3);\
b5 = _mm512_xor_si512(b5, a3);\
b0 = a0; \
b3 = mm512_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm512_xor3( b6, a4, TEMP2 ); \
b4 = mm512_xor3( b4, a0, TEMP2 ); \
b7 = mm512_xor3( b7, a5, a3 ); \
b5 = mm512_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm512_xor_si512(a0, a3);\
a1 = _mm512_xor_si512(a1, a4);\
a2 = _mm512_xor_si512(a2, a5);\
a3 = _mm512_xor_si512(a3, a6);\
a4 = _mm512_xor_si512(a4, a7);\
a5 = _mm512_xor_si512(a5, b0);\
a6 = _mm512_xor_si512(a6, b1);\
a7 = _mm512_xor_si512(a7, TEMP2);\
a0 = _mm512_xor_si512( a0, a3 ); \
a1 = _mm512_xor_si512( a1, a4 ); \
a2 = _mm512_xor_si512( TEMP2, a5 ); \
a3 = _mm512_xor_si512( a3, a6 ); \
a4 = _mm512_xor_si512( a4, a7 ); \
a5 = _mm512_xor_si512( a5, b0 ); \
a6 = _mm512_xor_si512( a6, b1 ); \
a7 = _mm512_xor_si512( a7, TEMP2 ); \
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm512_xor_si512(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm512_xor_si512(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm512_xor_si512(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm512_xor_si512(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm512_xor_si512(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm512_xor_si512(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm512_xor_si512(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm512_xor_si512(a7, b7);\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
MUL2( a0, b0, b1 ); \
a0 = _mm512_xor_si512( a0, TEMP0 ); \
MUL2( a1, b0, b1 ); \
a1 = _mm512_xor_si512( a1, TEMP1 ); \
MUL2( a2, b0, b1 ); \
a2 = _mm512_xor_si512( a2, b2 ); \
MUL2( a3, b0, b1 ); \
a3 = _mm512_xor_si512( a3, b3 ); \
MUL2( a4, b0, b1 ); \
a4 = _mm512_xor_si512( a4, b4 ); \
MUL2( a5, b0, b1 ); \
a5 = _mm512_xor_si512( a5, b5 ); \
MUL2( a6, b0, b1 ); \
a6 = _mm512_xor_si512( a6, b6 ); \
MUL2( a7, b0, b1 ); \
a7 = _mm512_xor_si512( a7, b7 ); \
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm512_xor_si512(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm512_xor_si512(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm512_xor_si512(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm512_xor_si512(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm512_xor_si512(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm512_xor_si512(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
MUL2( a0, b0, b1 ); \
b5 = _mm512_xor_si512( b5, a0 ); \
MUL2( a1, b0, b1 ); \
b6 = _mm512_xor_si512( b6, a1 ); \
MUL2( a2, b0, b1 ); \
b7 = _mm512_xor_si512( b7, a2 ); \
MUL2( a5, b0, b1 ); \
b2 = _mm512_xor_si512( b2, a5 ); \
MUL2( a6, b0, b1 ); \
b3 = _mm512_xor_si512( b3, a6 ); \
MUL2( a7, b0, b1 ); \
b4 = _mm512_xor_si512( b4, a7 ); \
MUL2( a3, b0, b1 ); \
MUL2( a4, b0, b1 ); \
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm512_xor_si512(b0, a3);\
b1 = _mm512_xor_si512(b1, a4);\
b0 = _mm512_xor_si512( b0, a3 ); \
b1 = _mm512_xor_si512( b1, a4 ); \
}/*MixBytes*/
/* one round
@@ -709,11 +697,9 @@ static const __m256i SUBSH_MASK7_2WAY =
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2_2WAY(i, j, k){\
j = _mm256_xor_si256(j, j);\
j = _mm256_cmpgt_epi8(j, i );\
j = _mm256_cmpgt_epi8( m256_zero, i );\
i = _mm256_add_epi8(i, i);\
j = _mm256_and_si256(j, k);\
i = _mm256_xor_si256(i, j);\
i = mm256_xorand( i, j, k );\
}
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\