v3.21.3 Unreleased

2026-07-15 03:16:49 +00:00 · 2023-03-13 03:20:13 -04:00
parent b339450898
commit c6bc9d67fb
49 changed files with 1126 additions and 1111 deletions
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -1,6 +1,5 @@
 #include "blake2s-gate.h"
 #include "blake2s-hash-4way.h"
-//#include "sph-blake2s.h"
 #include <string.h>
 #include <stdint.h>

@@ -8,43 +7,6 @@

 static __thread blake2s_16way_state blake2s_16w_ctx;

-/*
-static blake2s_16way_state blake2s_16w_ctx;
-static uint32_t blake2s_16way_vdata[20*16] __attribute__ ((aligned (64)));
-*/
-/*
-int blake2s_16way_prehash( struct work *work )
-{
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, work->data );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
-   intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-   intrlv_16x32( blake2s_16way_vdata, edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata, 640 );
-   blake2s_16w_ctx.t[0] = 64;
-   return 1;
-}
-*/
-/*
-int blake2s_16way_prehash( struct work *work )
-{
-   mm512_bswap32_intrlv80_16x32( blake2s_16way_vdata, work->data );
-   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
-   blake2s_16way_update( &blake2s_16w_ctx, blake2s_16way_vdata, 64 );
-   return 1;
-}
-*/
-
 void blake2s_16way_hash( void *output, const void *input )
 {
   blake2s_16way_state ctx;
@@ -68,40 +30,10 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   int thr_id = mythr->id;  

-/*   
-//   pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( (__m512i*)vdata +16, (__m512i*)blake2s_16way_vdata +16, 3*4*16 );
-//     casti_m512i( vdata, 16 ) = casti_m512i( blake2s_16way_vdata, 16 );
-//     casti_m512i( vdata, 17 ) = casti_m512i( blake2s_16way_vdata, 17 );
-//     casti_m512i( vdata, 18 ) = casti_m512i( blake2s_16way_vdata, 18 );
-       
-//   pthread_rwlock_unlock( &g_work_lock );
-*/
-/*
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, pdata );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
-   intrlv_16x32( blake2s_16w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h,
-                                    ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-   intrlv_16x32( blake2s_16way_blake2s_16way_vdata, edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata,
-                                      edata, edata, edata, edata, 640 );
-   blake2s_16w_ctx.t[0] = 64;
-*/
-   
   mm512_bswap32_intrlv80_16x32( vdata, pdata );
   blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
   blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );

-
   do {
      *noncev = mm512_bswap_32( _mm512_set_epi32(
 	                  n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
@@ -131,36 +63,6 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,

 static __thread blake2s_8way_state blake2s_8w_ctx;

-/*
-static blake2s_8way_state blake2s_8w_ctx;
-static uint32_t blake2s_8way_vdata[20*8] __attribute__ ((aligned (32)));
-
-int blake2s_8way_prehash( struct work *work )
-{
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, work->data );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
-
-   for ( int i = 0; i < 8; i++ )
-      casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
-
-   casti_m256i( blake2s_8way_vdata, 16 ) = _mm256_set1_epi32( edata[16] );
-   casti_m256i( blake2s_8way_vdata, 17 ) = _mm256_set1_epi32( edata[17] );
-   casti_m256i( blake2s_8way_vdata, 18 ) = _mm256_set1_epi32( edata[18] );
-
-//   intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-//                                  ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-//   intrlv_8x32( blake2s_8way_vdata, edata, edata, edata, edata,
-//                                    edata, edata, edata, edata, 640 );
-   blake2s_8w_ctx.t[0] = 64;
-}
-*/
-
 void blake2s_8way_hash( void *output, const void *input )
 {
   blake2s_8way_state ctx;
@@ -184,41 +86,10 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   int thr_id = mythr->id; 

-/*   
-//   pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( &vdata[16*8], &blake2s_8way_vdata[16*8], 3*4*8 );
-//   pthread_rwlock_unlock( &g_work_lock );
-*/
-/*
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, pdata );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
-   for ( int i = 0; i < 8; i++ )
-      casti_m256i( blake2s_8w_ctx.h, i ) = _mm256_set1_epi32( ctx.h[i] );
-
-   casti_m256i( vdata, 16 ) = _mm256_set1_epi32( edata[16] );
-   casti_m256i( vdata, 17 ) = _mm256_set1_epi32( edata[17] );
-   casti_m256i( vdata, 18 ) = _mm256_set1_epi32( edata[18] );
-
-
-//  intrlv_8x32( blake2s_8w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h,
-//                                  ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-//   intrlv_8x32( vdata, edata, edata, edata, edata,
-//                                    edata, edata, edata, edata, 640 );
-
-   blake2s_8w_ctx.t[0] = 64;
-*/
-   
   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
   blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );

-
   do {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
                                                  n+3, n+2, n+1, n ) );
@@ -246,25 +117,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
 #elif defined(BLAKE2S_4WAY)

 static __thread blake2s_4way_state blake2s_4w_ctx;
-/*
-static blake2s_4way_state blake2s_4w_ctx;
-static uint32_t blake2s_4way_vdata[20*4] __attribute__ ((aligned (32)));

-int blake2s_4way_prehash( struct work *work )
-{
-   uint32_t edata[20] __attribute__ ((aligned (64)));
-   blake2s_state ctx;
-   mm128_bswap32_80( edata, work->data );
-   blake2s_init( &ctx, BLAKE2S_OUTBYTES );
-   ctx.buflen = ctx.t[0] = 64;
-   blake2s_compress( &ctx, (const uint8_t*)edata );
-
-   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
-   intrlv_4x32( blake2s_4w_ctx.h, ctx.h, ctx.h, ctx.h, ctx.h, 256 );
-   intrlv_4x32( blake2s_4way_vdata, edata, edata, edata, edata, 640 );
-   blake2s_4w_ctx.t[0] = 64;
-}
-*/
 void blake2s_4way_hash( void *output, const void *input )
 {
   blake2s_4way_state ctx;
@@ -287,15 +140,11 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
   int thr_id = mythr->id; 
-/*
-   pthread_rwlock_rdlock( &g_work_lock );
-       memcpy( vdata, blake2s_4way_vdata, sizeof vdata );
-   pthread_rwlock_unlock( &g_work_lock );
-*/
+
   mm128_bswap32_intrlv80_4x32( vdata, pdata );
   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
-   
+
   do {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      pdata[19] = n;
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -5,15 +5,13 @@ bool register_blake2s_algo( algo_gate_t* gate )
 #if defined(BLAKE2S_16WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_16way;
  gate->hash      = (void*)&blake2s_16way_hash;
-//  gate->prehash   = (void*)&blake2s_16way_prehash;
 #elif defined(BLAKE2S_8WAY)
+//#if defined(BLAKE2S_8WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_8way;
  gate->hash      = (void*)&blake2s_8way_hash;
-//  gate->prehash   = (void*)&blake2s_8way_prehash;
 #elif defined(BLAKE2S_4WAY)
  gate->scanhash  = (void*)&scanhash_blake2s_4way;
  gate->hash      = (void*)&blake2s_4way_hash;
-//  gate->prehash   = (void*)&blake2s_4way_prehash;
 #else
  gate->scanhash  = (void*)&scanhash_blake2s;
  gate->hash      = (void*)&blake2s_hash;
--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -23,22 +23,18 @@ bool register_blake2s_algo( algo_gate_t* gate );
 void blake2s_16way_hash( void *state, const void *input );
 int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int blake2s_16way_prehash( struct work * );

 #elif defined (BLAKE2S_8WAY)

 void blake2s_8way_hash( void *state, const void *input );
 int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int blake2s_8way_prehash( struct work * );

 #elif defined (BLAKE2S_4WAY)

 void blake2s_4way_hash( void *state, const void *input );
 int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-int blake2s_4way_prehash( struct work * );
-
 #else

 void blake2s_hash( void *state, const void *input );
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -105,8 +105,8 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )

 #define G4W( sigma0, sigma1, a, b, c, d ) \
 do { \
-   const uint8_t s0 = sigma0; \
-   const uint8_t s1 = sigma1; \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
   d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
   c = _mm_add_epi32( c, d ); \
@@ -120,7 +120,7 @@ do { \

 #define ROUND4W(r)  \
 do { \
-   const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
   G4W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
   G4W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
   G4W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -317,8 +317,8 @@ do { \

 #define G8W( sigma0, sigma1, a, b, c, d) \
 do { \
-   const uint8_t s0 = sigma0; \
-   const uint8_t s1 = sigma1; \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
   d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi32( c, d ); \
@@ -331,7 +331,7 @@ do { \

 #define ROUND8W(r)  \
 do { \
-   const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
   G8W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
   G8W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
   G8W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
@@ -529,8 +529,8 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )

 #define G16W( sigma0, sigma1, a, b, c, d) \
 do { \
-   const uint8_t s0 = sigma0; \
-   const uint8_t s1 = sigma1; \
+   uint8_t s0 = sigma0; \
+   uint8_t s1 = sigma1; \
   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), m[ s0 ] ); \
   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
   c = _mm512_add_epi32( c, d ); \
@@ -543,7 +543,7 @@ do { \

 #define ROUND16W(r)  \
 do { \
-   const uint8_t *sigma = (const uint8_t*)&blake2s_sigma[r]; \
+   uint8_t *sigma = (uint8_t*)&blake2s_sigma[r]; \
   G16W( sigma[ 0], sigma[ 1], v[ 0], v[ 4], v[ 8], v[12] ); \
   G16W( sigma[ 2], sigma[ 3], v[ 1], v[ 5], v[ 9], v[13] ); \
   G16W( sigma[ 4], sigma[ 5], v[ 2], v[ 6], v[10], v[14] ); \
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -20,7 +20,6 @@

 #include <stddef.h>
 #include <stdint.h>
-//#include "sph-blake2s.h"

 #if defined(_MSC_VER)
 #include <inttypes.h>
@@ -34,7 +33,7 @@
 #if defined(__cplusplus)
 extern "C" {
 #endif
-/*
+
 enum blake2s_constant
 {
   BLAKE2S_BLOCKBYTES = 64,
@@ -43,13 +42,6 @@ enum blake2s_constant
   BLAKE2S_SALTBYTES  = 8,
   BLAKE2S_PERSONALBYTES = 8
 };
-*/
-
-#define BLAKE2S_BLOCKBYTES  64
-#define BLAKE2S_OUTBYTES    32
-#define BLAKE2S_KEYBYTES    32
-#define BLAKE2S_SALTBYTES   8
-#define BLAKE2S_PERSONALBYTES  8

 #pragma pack(push, 1)
 typedef struct __blake2s_nway_param
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -8,6 +8,8 @@
 #include "sph-blake2s.h"

 static __thread blake2s_state blake2s_ctx;
+//static __thread blake2s_state s_ctx;
+#define MIDLEN 76

 void blake2s_hash( void *output, const void *input )
 {
@@ -17,27 +19,37 @@ void blake2s_hash( void *output, const void *input )
   memcpy( &ctx, &blake2s_ctx, sizeof ctx );
   blake2s_update( &ctx, input+64, 16 );
 
+//	blake2s_init(&ctx, BLAKE2S_OUTBYTES);
+//	blake2s_update(&ctx, input, 80);
 	blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );

 	memcpy(output, hash, 32);
 }
-
+/*
+static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
+{
+	s_ctx.buflen = MIDLEN;
+	memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
+	blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
+	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
+}
+*/
 int scanhash_blake2s( struct work *work,
 	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;

 	uint32_t _ALIGN(64) hash64[8];
 	uint32_t _ALIGN(64) endiandata[20];
-   int thr_id = mythr->id;  
+   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
 	const uint32_t first_nonce = pdata[19];

 	uint32_t n = first_nonce;

-   swab32_array( endiandata, pdata, 20 );
+        swab32_array( endiandata, pdata, 20 );

 	// midstate
 	blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
@@ -46,12 +58,11 @@ int scanhash_blake2s( struct work *work,
 	do {
 		be32enc(&endiandata[19], n);
 		blake2s_hash( hash64, endiandata );
-      if (hash64[7] <= Htarg )
-      if ( fulltest(hash64, ptarget) && !opt_benchmark )
-      {
-         pdata[19] = n;
-         submit_solution( work, hash64, mythr );
-      }
+		if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1;
+			pdata[19] = n;
+			return true;
+		}
 		n++;

 	} while (n < max_nonce && !work_restart[thr_id].restart);
--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -17,7 +17,6 @@

 #include "algo/sha/sph_types.h"
 #include "sph-blake2s.h"
-#include "simd-utils.h"

 static const uint32_t blake2s_IV[8] =
 {
@@ -226,71 +225,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
 	v[13] = S->t[1] ^ blake2s_IV[5];
 	v[14] = S->f[0] ^ blake2s_IV[6];
 	v[15] = S->f[1] ^ blake2s_IV[7];
-
-#if 0    
-//#if defined(__SSE2__) // always true
-
-The only application for this is to do a prehash for the blake2s algorithm.
-SSE2 also supports 4 way parallel hashing so that is preferred in most cases.
-Testing has found that using this serial SIMD code for prehash is slower than
-doing a parallel hash. A parallel hash has more instructions and uses more
-data. The serial hash uses fewer instructions and data and only needs to
-interleave the final hash into parallel streams. This has shown negligible
-improvement on other algos, notably blake256 which is almost identical.
-Considering the low frequency of prehash no statistically valid change
-was expected. It was simply better on paper.
-
-Furthermore, simply defining this macro has an additional negative effect on
-blake2s as a whole. There are no references to this macro, blake2s-4way does
-not include it in any header files, it's just another unused macro which should
-have no effect beyond the preprocessor. But just being visible to the compiler
-changes things in a dramatic way.
-
-These 2 things combined reduced the hash rate for blake2s by more than 5% when
-using serial SIMD for the blake2s prehash over 16way parallel prehash.
-16way parallel hashing was used in the high frequency nonce loop in both cases.
-Comsidering the prehash represents 50% of the algorithm and is done once vs
-the high frequency second half that is done mega, maybe giga, times more it's
-hard to imagine that big of an effect in either direction.
-
-#define ROUND( r ) \
-{ \
-   __m128i *V = (__m128i*)v; \
-   const uint8_t *sigma = blake2s_sigma[r]; \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                       _mm_set_epi32( m[ sigma[ 6 ] ], m[ sigma[ 4 ] ], \
-                                      m[ sigma[ 2 ] ], m[ sigma[ 0 ] ] ) ) ); \
-   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                        _mm_set_epi32( m[ sigma[ 7 ] ], m[ sigma[ 5 ] ], \
-                                       m[ sigma[ 3 ] ], m[ sigma[ 1 ] ] ) ) ); \
-   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
-   V[3] = mm128_shufll_32( V[3] ); \
-   V[2] = mm128_swap_64( V[2] ); \
-   V[1] = mm128_shuflr_32( V[1] ); \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                        _mm_set_epi32( m[ sigma[14] ], m[ sigma[12] ], \
-                                       m[ sigma[10] ], m[ sigma[ 8] ] ) ) ); \
-   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
-   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], \
-                        _mm_set_epi32( m[ sigma[15] ], m[ sigma[13] ], \
-                                       m[ sigma[11] ], m[ sigma[ 9] ] ) ) ); \
-   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
-   V[2] = _mm_add_epi32( V[2], V[3] ); \
-   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
-   V[3] = mm128_shuflr_32( V[3] ); \
-   V[2] = mm128_swap_64( V[2] ); \
-   V[1] = mm128_shufll_32( V[1] ); \
-}
-
-#else
-
 #define G(r,i,a,b,c,d) \
 	do { \
 		a = a + b + m[blake2s_sigma[r][2*i+0]]; \
@@ -313,10 +247,7 @@ hard to imagine that big of an effect in either direction.
 		G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
 		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
 	} while(0)
-
-#endif
-
-   ROUND( 0 );
+	ROUND( 0 );
 	ROUND( 1 );
 	ROUND( 2 );
 	ROUND( 3 );
--- a/algo/blake/sph-blake2s.h
+++ b/algo/blake/sph-blake2s.h
@@ -91,7 +91,6 @@ static inline void secure_zero_memory(void *v, size_t n)
 extern "C" {
 #endif

-/*   
 	enum blake2s_constant
 	{
 		BLAKE2S_BLOCKBYTES = 64,
@@ -100,13 +99,6 @@ extern "C" {
 		BLAKE2S_SALTBYTES  = 8,
 		BLAKE2S_PERSONALBYTES = 8
 	};
-*/
-
-#define BLAKE2S_BLOCKBYTES  64
-#define BLAKE2S_OUTBYTES    32
-#define BLAKE2S_KEYBYTES    32
-#define BLAKE2S_SALTBYTES   8
-#define BLAKE2S_PERSONALBYTES  8

 #pragma pack(push, 1)
 	typedef struct __blake2s_param
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -103,16 +103,16 @@
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
-   V2 = mm128_alignr_64( V[3], V[2], 1 ); \
-   V3 = mm128_alignr_64( V[2], V[3], 1 ); \
-   V6 = mm128_alignr_64( V[6], V[7], 1 ); \
-   V7 = mm128_alignr_64( V[7], V[6], 1 ); \
+   V2 = mm128_alignr_64( V[3], V[2] ); \
+   V3 = mm128_alignr_64( V[2], V[3] ); \
+   V6 = mm128_alignr_64( V[6], V[7] ); \
+   V7 = mm128_alignr_64( V[7], V[6] ); \
   BLAKE2B_G( V[0], V2, V[5], V6,  8,  9, 10, 11 ); \
   BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
-   V[2] = mm128_alignr_64( V2, V3, 1 ); \
-   V[3] = mm128_alignr_64( V3, V2, 1 ); \
-   V[6] = mm128_alignr_64( V7, V6, 1 ); \
-   V[7] = mm128_alignr_64( V6, V7, 1 ); \
+   V[2] = mm128_alignr_64( V2, V3 ); \
+   V[3] = mm128_alignr_64( V3, V2 ); \
+   V[6] = mm128_alignr_64( V7, V6 ); \
+   V[7] = mm128_alignr_64( V6, V7 ); \
 }

 #else