#include "shavite-hash-4way.h"
|
|
#include <stdint.h>
|
|
|
|
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
|
|
|
static const uint32_t IV512[] =
|
|
{
|
|
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
|
|
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
|
|
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
|
|
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
|
|
};
|
|
|
|
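// Per 128-bit lane, returns { a1, a2, a3, b0 }: the low 128 bits of the
// 256-bit concatenation b:a rotated right by one 32-bit word. Used by the
// key schedule in the even-numbered rounds.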
#define mm512_ror2x512hi_1x32( a, b ) \
   _mm512_mask_blend_epi32( 0x8888, mm512_shuflr128_32( a ), \
                                    mm512_shuflr128_32( b ) )

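// SHAvite-3 512-bit compression function, 4 lanes in parallel via VAES:
// each step is one AES round ( _mm512_aesenc_epi128 ) in every 128-bit
// lane. Rounds are numbered 0-13: round 0 is unrolled before the loop,
// rounds 1-12 run as three iterations of four, and round 13 follows.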
static void
c512_4way( shavite512_4way_context *ctx, const void *msg )
{
   register __m512i X;
   register __m512i P0, P1, P2, P3;
   register __m512i K0, K1, K2, K3, K4, K5, K6, K7;
   __m512i *M = (__m512i*)msg;
   __m512i *H = (__m512i*)ctx->h;
   const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
                                            ctx->count1, ctx->count0 );
   int r;

   P0 = H[0];
   P1 = H[1];
   P2 = H[2];
   P3 = H[3];

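   // The message block (128 bytes per lane) supplies the initial round keys.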
   K0 = M[0];
   K1 = M[1];
   K2 = M[2];
   K3 = M[3];
   K4 = M[4];
   K5 = M[5];
   K6 = M[6];
   K7 = M[7];

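   // round 0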
   X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

   P0 = _mm512_xor_si512( P0, X );

   X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

   P2 = _mm512_xor_si512( P2, X );

   // Rounds 1-12, four rounds per iteration. The block counter is folded
   // into the key schedule once per iteration, each time with one 32-bit
   // word complemented: into K0 when r == 0, K1 when r == 1, K7 when r == 2.
   for ( r = 0; r < 3; r ++ )
   {
      // round 1, 5, 9

      K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
                                 _mm512_aesenc_epi128( K0, m512_zero ) ) );

      if ( r == 0 )
         K0 = _mm512_xor_si512( K0,
                 _mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );

      X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
      K1 = _mm512_xor_si512( K0,
              mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );

      if ( r == 1 )
         K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
                 _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );

      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
      K2 = _mm512_xor_si512( K1,
              mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
      K3 = _mm512_xor_si512( K2,
              mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

      P3 = _mm512_xor_si512( P3, X );

      K4 = _mm512_xor_si512( K3,
              mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
      K5 = _mm512_xor_si512( K4,
              mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
      K6 = _mm512_xor_si512( K5,
              mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
      K7 = _mm512_xor_si512( K6,
              mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );

      if ( r == 2 )
         K7 = _mm512_xor_si512( K7, mm512_swap128_64(
                 _mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );

      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
      P1 = _mm512_xor_si512( P1, X );

      // round 2, 6, 10

      K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
      K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
      K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
      K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

      P2 = _mm512_xor_si512( P2, X );

      K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
      K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
      K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
      K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

      P0 = _mm512_xor_si512( P0, X );

      // round 3, 7, 11

      K0 = _mm512_xor_si512( mm512_shuflr128_32(
                             _mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
      K1 = _mm512_xor_si512( mm512_shuflr128_32(
                             _mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
      K2 = _mm512_xor_si512( mm512_shuflr128_32(
                             _mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
      K3 = _mm512_xor_si512( mm512_shuflr128_32(
                             _mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

      P1 = _mm512_xor_si512( P1, X );

      K4 = _mm512_xor_si512( mm512_shuflr128_32(
                             _mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
      K5 = _mm512_xor_si512( mm512_shuflr128_32(
                             _mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
      K6 = _mm512_xor_si512( mm512_shuflr128_32(
                             _mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
      K7 = _mm512_xor_si512( mm512_shuflr128_32(
                             _mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

      P3 = _mm512_xor_si512( P3, X );

      // round 4, 8, 12

      K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
      K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
      K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
      K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

      P0 = _mm512_xor_si512( P0, X );

      K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
      K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
      K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
      K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
      X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

      P2 = _mm512_xor_si512( P2, X );
   }

   // round 13

   K0 = _mm512_xor_si512( mm512_shuflr128_32(
                          _mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
   K1 = _mm512_xor_si512( mm512_shuflr128_32(
                          _mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
   K2 = _mm512_xor_si512( mm512_shuflr128_32(
                          _mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
   K3 = _mm512_xor_si512( mm512_shuflr128_32(
                          _mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );

   P3 = _mm512_xor_si512( P3, X );

   K4 = _mm512_xor_si512( mm512_shuflr128_32(
                          _mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
   K5 = _mm512_xor_si512( mm512_shuflr128_32(
                          _mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );

   // final counter injection: count2 complemented, words reordered
   K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
   K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
                   ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );

   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
   K7 = _mm512_xor_si512( mm512_shuflr128_32(
                          _mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );

   P1 = _mm512_xor_si512( P1, X );

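   // Feed-forward: fold the permuted state back into the chaining value.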
   H[0] = _mm512_xor_si512( H[0], P2 );
   H[1] = _mm512_xor_si512( H[1], P3 );
   H[2] = _mm512_xor_si512( H[2], P0 );
   H[3] = _mm512_xor_si512( H[3], P1 );
}

void shavite512_4way_init( shavite512_4way_context *ctx )
{
   __m512i *h = (__m512i*)ctx->h;
   __m128i *iv = (__m128i*)IV512;

   h[0] = m512_const1_128( iv[0] );
   h[1] = m512_const1_128( iv[1] );
   h[2] = m512_const1_128( iv[2] );
   h[3] = m512_const1_128( iv[3] );

   ctx->ptr = 0;
   ctx->count0 = 0;
   ctx->count1 = 0;
   ctx->count2 = 0;
   ctx->count3 = 0;
}

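// Data is interleaved 4 ways, one 32-bit word per lane: len is bytes per
// lane, and the interleaved buffer advances 4 bytes per per-lane byte.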
// not tested, use update_close
void shavite512_4way_update( shavite512_4way_context *ctx, const void *data,
                             size_t len )
{
   unsigned char *buf = ctx->buf;
   size_t ptr = ctx->ptr;

   while ( len > 0 )
   {
      size_t clen;

      clen = (sizeof ctx->buf) - ptr;
      if ( clen > len << 2 )
         clen = len << 2;
      memcpy( buf + ptr, data, clen );
      data = (const unsigned char *)data + clen;
      ptr += clen;
      len -= clen >> 2;
      if ( ptr == sizeof ctx->buf )
      {
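         // 1024 bits per lane per block; ripple the carry through the
         // 128-bit block counter.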
         if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
         {
            ctx->count1 = ctx->count1 + 1;
            if ( ctx->count1 == 0 )
            {
               ctx->count2 = ctx->count2 + 1;
               if ( ctx->count2 == 0 )
                  ctx->count3 = ctx->count3 + 1;
            }
         }
         c512_4way( ctx, buf );
         ptr = 0;
      }
   }
   ctx->ptr = ptr;
}

// not tested
void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
{
   unsigned char *buf;
   union
   {
      uint32_t u32[4];
      uint16_t u16[8];
   } count;

   buf = ctx->buf;
   uint32_t vp = ctx->ptr >> 6;

   // Terminating byte then zero pad
   casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );

   // Zero pad full vectors up to count
   for ( ; vp < 6; vp++ )
      casti_m512i( buf, vp ) = m512_zero;

   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200.
   // Count is misaligned to 16 bits and straddles a vector.
   // Use u32 overlay to stage then u16 to load buf.
   count.u32[0] = ctx->count0 += (ctx->ptr << 1);   // ptr/4 * 8
   count.u32[1] = ctx->count1;
   count.u32[2] = ctx->count2;
   count.u32[3] = ctx->count3;

   casti_m512i( buf, 6 ) = m512_const1_128(
                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
   casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
                  0x0200, count.u16[7], count.u16[6], count.u16[5],
                          count.u16[4], count.u16[3], count.u16[2],
                          count.u16[1] ) );

   c512_4way( ctx, buf );

   casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 );
   casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 );
   casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 );
   casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
}

void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
                                   const void *data, size_t len )
{
   unsigned char *buf = ctx->buf;
   size_t ptr = ctx->ptr;

   // process full blocks and load buf with remainder.
   while ( len > 0 )
   {
      size_t clen;

      clen = (sizeof ctx->buf) - ptr;
      if ( clen > len << 2 )
         clen = len << 2;
      memcpy( buf + ptr, data, clen );
      data = (const unsigned char *)data + clen;
      ptr += clen;
      len -= (clen >> 2);
      if ( ptr == sizeof ctx->buf )
      {
         if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
         {
            ctx->count1 = ctx->count1 + 1;
            if ( ctx->count1 == 0 )
            {
               ctx->count2 = ctx->count2 + 1;
               if ( ctx->count2 == 0 )
                  ctx->count3 = ctx->count3 + 1;
            }
         }
         c512_4way( ctx, buf );
         ptr = 0;
      }
   }

   uint32_t vp = ptr >> 6;
   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200.
   // Count is misaligned to 16 bits and straddles 2 vectors.
   // Use u32 overlay to stage then u16 to load buf.
   union
   {
      uint32_t u32[4];
      uint16_t u16[8];
   } count;

   count.u32[0] = ctx->count0 += (ptr << 1);   // ptr/4 * 8
   count.u32[1] = ctx->count1;
   count.u32[2] = ctx->count2;
   count.u32[3] = ctx->count3;

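   // An empty buffer (input was a multiple of the block size, as in xevan's
   // second pass) gets a pure padding block: the compression runs with a
   // zero counter while the true bit length is still encoded in the block.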
   if ( vp == 0 )   // empty buf, xevan.
   {
      casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else   // half full buf, everyone else.
   {
      casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + vp, 6 - vp );
   }

   casti_m512i( buf, 6 ) = m512_const1_128(
                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
   casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
                  0x0200, count.u16[7], count.u16[6], count.u16[5],
                          count.u16[4], count.u16[3], count.u16[2],
                          count.u16[1] ) );

   c512_4way( ctx, buf );

   casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 );
   casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 );
   casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 );
   casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
}

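// init + update_close in one call.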
void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
                           const void *data, size_t len )
{
   __m512i *h = (__m512i*)ctx->h;
   __m128i *iv = (__m128i*)IV512;

   h[0] = m512_const1_128( iv[0] );
   h[1] = m512_const1_128( iv[1] );
   h[2] = m512_const1_128( iv[2] );
   h[3] = m512_const1_128( iv[3] );

   ctx->ptr    =
   ctx->count0 =
   ctx->count1 =
   ctx->count2 =
   ctx->count3 = 0;

   unsigned char *buf = ctx->buf;
   size_t ptr = ctx->ptr;

   // process full blocks and load buf with remainder.
   while ( len > 0 )
   {
      size_t clen;

      clen = (sizeof ctx->buf) - ptr;
      if ( clen > len << 2 )
         clen = len << 2;
      memcpy( buf + ptr, data, clen );
      data = (const unsigned char *)data + clen;
      ptr += clen;
      len -= (clen >> 2);
      if ( ptr == sizeof ctx->buf )
      {
         if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
         {
            ctx->count1 = ctx->count1 + 1;
            if ( ctx->count1 == 0 )
            {
               ctx->count2 = ctx->count2 + 1;
               if ( ctx->count2 == 0 )
                  ctx->count3 = ctx->count3 + 1;
            }
         }
         c512_4way( ctx, buf );
         ptr = 0;
      }
   }

   uint32_t vp = ptr >> 6;
   // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200.
   // Count is misaligned to 16 bits and straddles 2 vectors.
   // Use u32 overlay to stage then u16 to load buf.
   union
   {
      uint32_t u32[4];
      uint16_t u16[8];
   } count;

   count.u32[0] = ctx->count0 += (ptr << 1);   // ptr/4 * 8
   count.u32[1] = ctx->count1;
   count.u32[2] = ctx->count2;
   count.u32[3] = ctx->count3;

   if ( vp == 0 )   // empty buf, xevan.
   {
      casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else   // half full buf, everyone else.
   {
      casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + vp, 6 - vp );
   }

   casti_m512i( buf, 6 ) = m512_const1_128(
                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
   casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
                  0x0200, count.u16[7], count.u16[6], count.u16[5],
                          count.u16[4], count.u16[3], count.u16[2],
                          count.u16[1] ) );

   c512_4way( ctx, buf );

   casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 );
   casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 );
   casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 );
   casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
}
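
// Usage sketch (vdata and vhash are hypothetical 64-byte-aligned, 4-way
// interleaved buffers): hash four 80-byte inputs in parallel, producing
// 64 bytes per lane.
//
//    shavite512_4way_context ctx;
//    shavite512_4way_full( &ctx, vhash, vdata, 80 );   // 80 bytes per lane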

#endif // VAES