v3.21.3 Unreleased

This commit is contained in:
Jay D Dee
2023-03-13 03:20:13 -04:00
parent b339450898
commit c6bc9d67fb
49 changed files with 1126 additions and 1111 deletions

View File

@@ -1,6 +1,5 @@
#include "blake2s-gate.h"
#include "blake2s-hash-4way.h"
//#include "sph-blake2s.h"
#include <string.h>
#include <stdint.h>
@@ -8,43 +7,6 @@
static __thread blake2s_16way_state blake2s_16w_ctx;
/*
static blake2s_16way_state blake2s_16w_ctx;
static uint32_t blake2s_16way_vdata[20*16] __attribute__ ((aligned (64)));
*/
/*
int blake2s_16way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_16x32( blake2s_16way_vdata, edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake2s_16w_ctx.t[0] = 64;
return 1;
}
*/
/*
int blake2s_16way_prehash( struct work *work )
{
mm512_bswap32_intrlv80_16x32( blake2s_16way_vdata, work->data );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, blake2s_16way_vdata, 64 );
return 1;
}
*/
void blake2s_16way_hash( void *output, const void *input )
{
blake2s_16way_state ctx;
@@ -68,40 +30,10 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;
/*
// pthread_rwlock_rdlock( &g_work_lock );
memcpy( (__m512i*)vdata +16, (__m512i*)blake2s_16way_vdata +16, 3*4*16 );
// casti_m512i( vdata, 16 ) = casti_m512i( blake2s_16way_vdata, 16 );
// casti_m512i( vdata, 17 ) = casti_m512i( blake2s_16way_vdata, 17 );
// casti_m512i( vdata, 18 ) = casti_m512i( blake2s_16way_vdata, 18 );
// pthread_rwlock_unlock( &g_work_lock );
*/
/*
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, pdata );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h,
ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_16x32( blake2s_16way_blake2s_16way_vdata, edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake2s_16w_ctx.t[0] = 64;
*/
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
@@ -131,36 +63,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
static __thread blake2s_8way_state blake2s_8w_ctx;
/*
static blake2s_8way_state blake2s_8w_ctx;
static uint32_t blake2s_8way_vdata[20*8] __attribute__ ((aligned (32)));
int blake2s_8way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
for ( int i = 0; i < 8; i++ )
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
casti_m256i( blake2s_8way_vdata, 16 ) = _mm256_set1_epi32( edata[16] );
casti_m256i( blake2s_8way_vdata, 17 ) = _mm256_set1_epi32( edata[17] );
casti_m256i( blake2s_8way_vdata, 18 ) = _mm256_set1_epi32( edata[18] );
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
// intrlv_8x32( blake2s_8way_vdata, edata, edata, edata, edata,
// edata, edata, edata, edata, 640 );
blake2s_8w_ctx.t[0] = 64;
}
*/
void blake2s_8way_hash( void *output, const void *input )
{
blake2s_8way_state ctx;
@@ -184,41 +86,10 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;
/*
// pthread_rwlock_rdlock( &g_work_lock );
memcpy( &vdata[16*8], &blake2s_8way_vdata[16*8], 3*4*8 );
// pthread_rwlock_unlock( &g_work_lock );
*/
/*
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, pdata );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
for ( int i = 0; i < 8; i++ )
casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
casti_m256i( vdata, 16 ) = _mm256_set1_epi32( edata[16] );
casti_m256i( vdata, 17 ) = _mm256_set1_epi32( edata[17] );
casti_m256i( vdata, 18 ) = _mm256_set1_epi32( edata[18] );
// intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
// ctx.h, ctx.h, ctx.h, ctx.h, 256 );
// intrlv_8x32( vdata, edata, edata, edata, edata,
// edata, edata, edata, edata, 640 );
blake2s_8w_ctx.t[0] = 64;
*/
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
@@ -246,25 +117,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
#elif defined(BLAKE2S_4WAY)
static __thread blake2s_4way_state blake2s_4w_ctx;
/*
static blake2s_4way_state blake2s_4w_ctx;
static uint32_t blake2s_4way_vdata[20*4] __attribute__ ((aligned (32)));
int blake2s_4way_prehash( struct work *work )
{
uint32_t edata[20] __attribute__ ((aligned (64)));
blake2s_state ctx;
mm128_bswap32_80( edata, work->data );
blake2s_init( &ctx, BLAKE2S_OUTBYTES );
ctx.buflen = ctx.t[0] = 64;
blake2s_compress( &ctx, (const uint8_t*)edata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
intrlv_4x32( blake2s_4w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, 256 );
intrlv_4x32( blake2s_4way_vdata, edata, edata, edata, edata, 640 );
blake2s_4w_ctx.t[0] = 64;
}
*/
void blake2s_4way_hash( void *output, const void *input )
{
blake2s_4way_state ctx;
@@ -287,15 +140,11 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
__m128i *noncev = (__m128i*)vdata + 19; // aligned
uint32_t n = first_nonce;
int thr_id = mythr->id;
/*
pthread_rwlock_rdlock( &g_work_lock );
memcpy( vdata, blake2s_4way_vdata, sizeof vdata );
pthread_rwlock_unlock( &g_work_lock );
*/
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
pdata[19] = n;

View File

@@ -5,15 +5,13 @@ bool register_blake2s_algo( algo_gate_t* gate )
#if defined(BLAKE2S_16WAY)
gate->scanhash = (void*)&scanhash_blake2s_16way;
gate->hash = (void*)&blake2s_16way_hash;
// gate->prehash = (void*)&blake2s_16way_prehash;
#elif defined(BLAKE2S_8WAY)
//#if defined(BLAKE2S_8WAY)
gate->scanhash = (void*)&scanhash_blake2s_8way;
gate->hash = (void*)&blake2s_8way_hash;
// gate->prehash = (void*)&blake2s_8way_prehash;
#elif defined(BLAKE2S_4WAY)
gate->scanhash = (void*)&scanhash_blake2s_4way;
gate->hash = (void*)&blake2s_4way_hash;
// gate->prehash = (void*)&blake2s_4way_prehash;
#else
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;

View File

@@ -23,22 +23,18 @@ bool register_blake2s_algo( algo_gate_t* gate );
void blake2s_16way_hash( void *state, const void *input );
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_16way_prehash( struct work * );
#elif defined (BLAKE2S_8WAY)
void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_8way_prehash( struct work * );
#elif defined (BLAKE2S_4WAY)
void blake2s_4way_hash( void *state, const void *input );
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int blake2s_4way_prehash( struct work * );
#else
void blake2s_hash( void *state, const void *input );

View File

@@ -105,8 +105,8 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
@@ -120,7 +120,7 @@ do { \
#define ROUND4W(r) \
do { \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -317,8 +317,8 @@ do { \
#define G8W( sigma0, sigma1, a, b, c, d) \
do { \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
c = _mm256_add_epi32( c, d ); \
@@ -331,7 +331,7 @@ do { \
#define ROUND8W(r) \
do { \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -529,8 +529,8 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
#define G16W( sigma0, sigma1, a, b, c, d) \
do { \
const uint8_t s0 = sigma0; \
const uint8_t s1 = sigma1; \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi32( c, d ); \
@@ -543,7 +543,7 @@ do { \
#define ROUND16W(r) \
do { \
const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \

View File

@@ -20,7 +20,6 @@
#include <stddef.h>
#include <stdint.h>
//#include "sph-blake2s.h"
#if defined(_MSC_VER)
#include <inttypes.h>
@@ -34,7 +33,7 @@
#if defined(__cplusplus)
extern "C" {
#endif
/*
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
@@ -43,13 +42,6 @@ enum blake2s_constant
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
*/
#define BLAKE2S_BLOCKBYTES 64
#define BLAKE2S_OUTBYTES 32
#define BLAKE2S_KEYBYTES 32
#define BLAKE2S_SALTBYTES 8
#define BLAKE2S_PERSONALBYTES 8
#pragma pack(push, 1)
typedef struct __blake2s_nway_param

View File

@@ -8,6 +8,8 @@
#include "sph-blake2s.h"
static __thread blake2s_state blake2s_ctx;
//static __thread blake2s_state s_ctx;
#define MIDLEN 76
void blake2s_hash( void *output, const void *input )
{
@@ -17,27 +19,37 @@ void blake2s_hash( void *output, const void *input )
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
blake2s_update( &ctx, input+64, 16 );
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
// blake2s_update(&ctx, input, 80);
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
memcpy(output, hash, 32);
}
/*
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
{
s_ctx.buflen = MIDLEN;
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
}
*/
int scanhash_blake2s( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[20];
int thr_id = mythr->id;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
swab32_array( endiandata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );
// midstate
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
@@ -46,12 +58,11 @@ int scanhash_blake2s( struct work *work,
do {
be32enc(&endiandata[19], n);
blake2s_hash( hash64, endiandata );
if (hash64[7] <= Htarg )
if ( fulltest(hash64, ptarget) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -17,7 +17,6 @@
#include "algo/sha/sph_types.h"
#include "sph-blake2s.h"
#include "simd-utils.h"
static const uint32_t blake2s_IV[8] =
{
@@ -226,71 +225,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
v[13] = S->t[1] ^ blake2s_IV[5];
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
#if 0
//#if defined(__SSE2__) // always true
The only application for this is to do a prehash for the blake2s algorithm.
SSE2 also supports 4 way parallel hashing so that is preferred in most cases.
Testing has found that using this serial SIMD code for prehash is slower than
doing a parallel hash. A parallel hash has more instructions and uses more
data. The serial hash uses fewer instructions and data and only needs to
interleave the final hash into parallel streams. This has shown negligible
improvement on other algos, notably blake256 which is almost identical.
Considering the low frequency of prehash no statistically valid change
was expected. It was simply better on paper.
Furthermore, simply defining this macro has an additional negative effect on
blake2s as a whole. There are no references to this macro, blake2s-4way does
not include it in any header files, it's just another unused macro which should
have no effect beyond the preprocessor. But just being visible to the compiler
changes things in a dramatic way.
These 2 things combined reduced the hash rate for blake2s by more than 5% when
using serial SIMD for the blake2s prehash over 16way parallel prehash.
16way parallel hashing was used in the high frequency nonce loop in both cases.
Comsidering the prehash represents 50% of the algorithm and is done once vs
the high frequency second half that is done mega, maybe giga, times more it's
hard to imagine that big of an effect in either direction.
#define ROUND( r ) \
{ \
__m128i *V = (__m128i*)v; \
const uint8_t *sigma = blake2s_sigma[r]; \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[ 6 ] ], m[ sigma[ 4 ] ], \
m[ sigma[ 2 ] ], m[ sigma[ 0 ] ] ) ) ); \
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[ 7 ] ], m[ sigma[ 5 ] ], \
m[ sigma[ 3 ] ], m[ sigma[ 1 ] ] ) ) ); \
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
V[3] = mm128_shufll_32( V[3] ); \
V[2] = mm128_swap_64( V[2] ); \
V[1] = mm128_shuflr_32( V[1] ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[14] ], m[ sigma[12] ], \
m[ sigma[10] ], m[ sigma[ 8] ] ) ) ); \
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
_mm_set_epi32( m[ sigma[15] ], m[ sigma[13] ], \
m[ sigma[11] ], m[ sigma[ 9] ] ) ) ); \
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
V[2] = _mm_add_epi32( V[2], V[3] ); \
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
V[3] = mm128_shuflr_32( V[3] ); \
V[2] = mm128_swap_64( V[2] ); \
V[1] = mm128_shufll_32( V[1] ); \
}
#else
#define G(r,i,a,b,c,d) \
do { \
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -313,10 +247,7 @@ hard to imagine that big of an effect in either direction.
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
} while(0)
#endif
ROUND( 0 );
ROUND( 0 );
ROUND( 1 );
ROUND( 2 );
ROUND( 3 );

View File

@@ -91,7 +91,6 @@ static inline void secure_zero_memory(void *v, size_t n)
extern "C" {
#endif
/*
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
@@ -100,13 +99,6 @@ extern "C" {
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
*/
#define BLAKE2S_BLOCKBYTES 64
#define BLAKE2S_OUTBYTES 32
#define BLAKE2S_KEYBYTES 32
#define BLAKE2S_SALTBYTES 8
#define BLAKE2S_PERSONALBYTES 8
#pragma pack(push, 1)
typedef struct __blake2s_param

View File

@@ -103,16 +103,16 @@
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
V2 = mm128_alignr_64( V[3], V[2] ); \
V3 = mm128_alignr_64( V[2], V[3] ); \
V6 = mm128_alignr_64( V[6], V[7] ); \
V7 = mm128_alignr_64( V[7], V[6] ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
V[2] = mm128_alignr_64( V2, V3 ); \
V[3] = mm128_alignr_64( V3, V2 ); \
V[6] = mm128_alignr_64( V7, V6 ); \
V[7] = mm128_alignr_64( V6, V7 ); \
}
#else