v3.9.6.2

2026-02-22 16:33:08 +00:00 · 2019-07-30 10:16:43 -04:00
parent a51f59086b
commit 9d49e0be7a
66 changed files with 1949 additions and 1470 deletions
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -308,12 +308,12 @@ static const sph_u32 CS[16] = {
 #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                 _mm_set_epi32( c1, c1, c1, c1 ), m0 ), b ), a ); \
+                                   _mm_set1_epi32( c1 ), m0 ), b ), a ); \
   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                 _mm_set_epi32( c0, c0, c0, c0 ), m1 ), b ), a ); \
+                                   _mm_set1_epi32( c0 ), m1 ), b ), a ); \
   d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
@@ -508,14 +508,18 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
-   V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
-   VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
-   VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
-   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
-   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
-   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
-   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
+   V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \
+   V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \
+   VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \
+   VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \
+   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
+                           m128_const1_64( 0xA4093822A4093822 ) ); \
+   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
+                           m128_const1_64( 0x299F31D0299F31D0 ) ); \
+   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \
+                           m128_const1_64( 0x082EFA98082EFA98 ) ); \
+   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \
+                           m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
   BLAKE256_4WAY_BLOCK_BSWAP32; \
   ROUND_S_4WAY(0); \
   ROUND_S_4WAY(1); \
@@ -631,16 +635,20 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); \
-   V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \
-   VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \
-   VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \
-   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); \
-   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
-   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
-   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
-   shuf_bswap32 = _mm256_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
-                                     0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+   V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \
+   V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \
+   VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \
+   VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \
+   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
+                              m256_const1_64( 0xA4093822A4093822 ) ); \
+   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
+                              m256_const1_64( 0x299F31D0299F31D0 ) ); \
+   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
+                              m256_const1_64( 0x082EFA98082EFA98 ) ); \
+   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
+                              m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
+   shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
   M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
@@ -696,14 +704,14 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
                   const uint32_t *salt, int rounds )
 {
   __m128i zero = m128_zero;
-   casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
-   casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
-   casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
-   casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
-   casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
-   casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
-   casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
-   casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
+   casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
+   casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
+   casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
+   casti_m128i( ctx->H, 3 ) = m128_const1_64( 0xA54FF53AA54FF53A );
+   casti_m128i( ctx->H, 4 ) = m128_const1_64( 0x510E527F510E527F );
+   casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
+   casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
+   casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );

   casti_m128i( ctx->S, 0 ) = zero;
   casti_m128i( ctx->S, 1 ) = zero;
@@ -778,12 +786,13 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
   else
      ctx->T0 -= 512 - bit_len;

-   buf[vptr] = _mm_set1_epi32( 0x80 );
+   buf[vptr] = m128_const1_64( 0x0000008000000080 );

   if ( vptr < 12 )
   {
      memset_zero_128( buf + vptr + 1, 13 - vptr  );
-      buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
+      buf[ 13 ] = _mm_or_si128( buf[ 13 ],
+                                m128_const1_64( 0x0100000001000000ULL ) );
      buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
      buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
      blake32_4way( ctx, buf + vptr, 64 - ptr );
@@ -795,7 +804,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
      ctx->T0 = 0xFFFFFE00UL;
      ctx->T1 = 0xFFFFFFFFUL;
      memset_zero_128( buf, 56>>2 );
-      buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
+      buf[ 13 ] = _mm_or_si128( buf[ 13 ],
+                                m128_const1_64( 0x0100000001000000ULL ) );
      buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
      buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
      blake32_4way( ctx, buf, 64 );
@@ -815,20 +825,18 @@ blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
                   const sph_u32 *salt, int rounds )
 {
   __m256i zero = m256_zero;
-   casti_m256i( sc->H, 0 ) = _mm256_set1_epi32( iv[0] );
-   casti_m256i( sc->H, 1 ) = _mm256_set1_epi32( iv[1] );
-   casti_m256i( sc->H, 2 ) = _mm256_set1_epi32( iv[2] );
-   casti_m256i( sc->H, 3 ) = _mm256_set1_epi32( iv[3] );
-   casti_m256i( sc->H, 4 ) = _mm256_set1_epi32( iv[4] );
-   casti_m256i( sc->H, 5 ) = _mm256_set1_epi32( iv[5] );
-   casti_m256i( sc->H, 6 ) = _mm256_set1_epi32( iv[6] );
-   casti_m256i( sc->H, 7 ) = _mm256_set1_epi32( iv[7] );
-
+   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
+   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
+   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
+   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53AA54FF53A );
+   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527F510E527F );
+   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
+   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
+   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
   casti_m256i( sc->S, 0 ) = zero;
   casti_m256i( sc->S, 1 ) = zero;
   casti_m256i( sc->S, 2 ) = zero;
   casti_m256i( sc->S, 3 ) = zero;
-
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
   sc->rounds = rounds;
@@ -887,7 +895,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
+   buf[ptr>>2] = m256_const1_64( 0x0000008000000080ULL );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -909,7 +917,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
       memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
       if ( out_size_w32 == 8 )
           buf[52>>2] = _mm256_or_si256( buf[52>>2],
-                                           _mm256_set1_epi32( 0x01000000UL ) );
+                                m256_const1_64( 0x0100000001000000ULL ) );
       *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
       *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
       blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
@@ -922,7 +930,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
        sc->T1 = SPH_C32(0xFFFFFFFFUL);
        memset_zero_256( buf, 56>>2 );
       if ( out_size_w32 == 8 )
-           buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
+           buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
        *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
        *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
        blake32_8way( sc, buf, 64 );
--- a/algo/blake/blake256-hash-4way.c.new
+++ b/algo/blake/blake256-hash-4way.c.new
@@ -1,322 +0,0 @@
-// convert blake256 32 bit to use 64 bit with serial vectoring
-//
-//  cut calls to GS in half
-//
-// combine V
-// v0 = {V0,V1}
-// v1 = {V2,V3}
-// v2 = {V4,V5}
-// v3 = {V6,V7}
-// v4 = {V8,V9}
-// v5 = {VA,VB}
-// v6 = {VC,VD}
-// v7 = {CE,VF}
-//
-// v6x = {VD,VC}      swap(VC,VD)   swap(v6)
-// v7x = {VF,VE}      swap(VE,VF)   swap(v7)
-//
-// V0 = v1v0
-// V1 = v3v2
-// V2 = v5v4
-// V3 = v7v6
-// V4 = v9v8
-// V5 = vbva
-// V6 = vdvc
-// V7 = vfve
-//
-// The rotate in ROUND is to effect straddle and unstraddle for the third
-// and 4th iteration of GS.
-// It concatenates 2 contiguous 256 bit vectors and extracts the middle
-// 256 bits. After the transform they must be restored with only the
-// chosen bits modified in the original 2 vectors.
-// ror1x128 achieves this by putting the chosen bits in arg1, the "low"
-// 256 bit vector and saves the untouched bits temporailly in arg0, the
-// "high" 256 bit vector. Simply reverse the process to restore data back
-// to original positions.
-
-// Use standard 4way when AVX2 is not available use x2 mode with AVX2.
-//
-// Data is organised the same as 32 bit 4 way, in effect serial vectoring
-// on top of parallel vectoring. Same data in the same place just taking
-// two chunks at a time.
-//
-// Transparent to user, x2 mode used when AVX2 detected.
-// Use existing 4way context but revert to scalar types.
-// Same interleave function (128 bit) or x2 with 256 bit?
-// User trsnaparency would have to apply to interleave as well.
-//
-// Use common 4way update and close
-
-/*
-typedef struct {
-   unsigned char buf[64<<2];
-   uint32_t H[8<<2];
-   uint32_t S[4<<2];
-   size_t ptr;
-   uint32_t T0, T1;
-   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
-} blakex2_4way_small_context __attribute__ ((aligned (64)));
-*/
-
-static void
-blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
-                   const uint32_t *salt, int rounds )
-{
-   casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
-   casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
-   casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
-   casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
-   casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
-   casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
-   casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
-   casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
-
-   casti_m128i( ctx->S, 0 ) = m128_zero;
-   casti_m128i( ctx->S, 1 ) = m128_zero;
-   casti_m128i( ctx->S, 2 ) = m128_zero;
-   casti_m128i( ctx->S, 3 ) = m128_zero;
-/*
-   sc->S[0] = _mm_set1_epi32( salt[0] );
-   sc->S[1] = _mm_set1_epi32( salt[1] );
-   sc->S[2] = _mm_set1_epi32( salt[2] );
-   sc->S[3] = _mm_set1_epi32( salt[3] );
-*/
-   ctx->T0 = ctx->T1 = 0;
-   ctx->ptr = 0;
-   ctx->rounds = rounds;
-}
-
-static void
-blake32x2( blake_4way_small_context *ctx, const void *data, size_t len )
-{
-   __m128i *buf = (__m256i*)ctx->buf;
-   size_t  bptr = ctx->ptr << 2;
-   size_t  vptr = ctx->ptr >> 3;
-   size_t  blen = len << 2;
-//    unsigned char *buf = ctx->buf;
-//    size_t ptr         = ctx->ptr<<4;  // repurposed
-    DECL_STATE32x2
-
-//    buf = sc->buf;
-//    ptr = sc->ptr;
-
-// adjust len for use with ptr, clen, all absolute bytes.
-//    int blen = len<<2;
-
-    if ( blen < (sizeof ctx->buf) - bptr )
-    {
-        memcpy( buf + vptr, data, blen );
-        ptr += blen;
-        ctx->ptr = bptr >> 2;;
-        return;
-    }
-
-    READ_STATE32( ctx );
-    while ( blen > 0 )
-    {
-        size_t clen;
-
-        clen = ( sizeof sc->buf ) - ptr;
-        if ( clen > blen )
-            clen = blen;
-        memcpy( buf + vptr, data, clen );
-        bptr += clen;
-        vptr = bptr >> 5;
-	data = (const unsigned char *)data + clen;
-        blen -= clen;
-        if ( bptr == sizeof ctx->buf )
-       	{
-           if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover
-               T1 += 1;
-           COMPRESS32x2_4WAY( ctx->rounds );
-           ptr = 0;
-        }
-    }
-    WRITE_STATE32x2( ctx );
-    ctx->ptr = bptr >> 2;
-}
-
-static void
-blake32x2_4way_close( blake_4way_small_context *ctx, void *dst )
-{
-   __m256i buf[8] __attribute__ ((aligned (64)));
-   size_t   ptr     = ctx->ptr;
-   size_t   vptr    = ctx->ptr>>2;
-   unsigned bit_len = ( (unsigned)ptr << 3 );  // one lane
-   uint32_t th      = ctx->T1;
-   uint32_t tl      = ctx->T0 + bit_len;
-
-   if ( ptr == 0 )
-   {
-        ctx->T0 = 0xFFFFFE00UL;
-        ctx->T1 = 0xFFFFFFFFUL;
-   }
-   else if ( ctx->T0 == 0 )
-   {
-      ctx->T0 = 0xFFFFFE00UL + bit_len;
-      ctx->T1 -= 1;
-   }
-   else
-      ctx->T0 -= 512 - bit_len;
-
-   // memset doesn't do ints
-   buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 );
-
-   if ( vptr < 5 )
-   {
-       memset_zero_256( buf + vptr + 1, 6 - vptr  );
-       buf[ 6 ] = _mm256_or_si256( vbuf[ 6 ], _mm256_set_epi32(
-             0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) ); 
-       buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl,
-			                            th,th,th,th ) );
-       blake32x2_4way( ctx, buf + vptr, 64 - ptr );
-   }
-   else
-   {
-       memset_zero_256( vbuf + vptr + 1, 7 - vptr );
-       blake32x2_4way( ctx,  vbuf + ptr, 64 - ptr );
-       ctx->T0 = 0xFFFFFE00UL;
-       ctx->T1 = 0xFFFFFFFFUL;
-       buf[ 6 ] = mm256_zero;
-       buf[ 6 ] = _mm256_set_epi32( 0,0,0,0,
-		         0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL );
-       buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl,
-                                                    th, th, th, th );
-       blake32x2_4way( ctx, buf, 64 );
-   }
-
-   casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) );
-   casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) );
-   casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) );
-   casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) );
-}
-
-
-
-
-#define DECL_STATE32x2_4WAY \
-   __m256i H0, H1, H2, H3; \
-   __m256i S0, S1; \
-   uint32_t T0, T1;
-
-#define READ_STATE32x2_4WAY(state)  do \
-{ \
-   H0 = casti_m256i( state->H, 0 ); \
-   H1 = casti_m256i( state->H, 1 ); \
-   H2 = casti_m256i( state->H, 2 ); \
-   H3 = casti_m256i( state->H, 3 ); \
-   S0 = casti_m256i( state->S, 0 ); \
-   S1 = casti_m256i( state->S, 1 ); \
-   T0 = state->T0; \
-   T1 = state->T1; \
-
-#define WRITE_STATE32x2_4WAY(state)   do { \
-   casti_m256i( state->H, 0 ) = H0; \
-   casti_m256i( state->H, 1 ) = H1; \
-   casti_m256i( state->H, 2 ) = H2; \
-   casti_m256i( state->H, 3 ) = H3; \
-   casti_m256i( state->S, 0 ) = S0; \
-   casti_m256i( state->S, 1 ) = S1; \
-   state->T0 = T0; \
-   state->T1 = T1; \
-} while (0)
-
-
-#define GSx2_4WAY( m0m2, m1m3, c0c2, c1c3, a, b, c, d ) do \
-{ \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-          _mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \
-	  _mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \
-   d = mm256_ror_32( _mm_xor_si128( d, a ), 16 ); \
-   c = _mm256_add_epi32( c, d ); \
-   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-          _mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \
-	  _mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
-   c = _mm256_add_epi32( c, d ); \
-   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
-} while (0)
-
-#define ROUND_Sx2_4WAY(r)   do \
-{ \
-  GS2_4WAY( Mx(r, 0),  Mx(r, 1),  Mx(r, 2),  Mx(r, 3), \
-           CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \
-  GS2_4WAY( Mx(r, 4),  Mx(r, 5),  Mx(r, 6),  Mx(r, 7), \
-           CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \
-  mm256_ror1x128_512( V3, V2 ); \
-  mm256_ror1x128_512( V6, V7 ); \
-  GS2_4WAY( Mx(r, 8),  Mx(r, 9),  Mx(r, A),  Mx(r, B), \
-           CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \
-  GS2_4WAY( Mx(r, C),  Mx(r, D),  Mx(r, C),  Mx(r, D), \
-           CSx(r, C), CSx(r, D), CSx(r, C), CSx(r, D), V1, V3, V4, V6 ); \
-  mm256_rol1x128_512( V2, V3 ); \
-  mm256_rol1x128_512( V7, V6 ); 
-
-#define COMPRESS32x2_4WAY( rounds ) do \
-{ \
-   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
-   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
-   unsigned r; \
-   V0 = H0; \
-   V1 = H1; \
-   V2 = H2; \
-   V3 = H3; \
-   V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \
-			                        CS0, CS0, CS0, CS0 ) ); \
-   V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \
-                                                CS2, CS2, CS2, CS2 ) ); \
-   V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \
-                              _mm256_set_epi32( CS5, CS5, CS5, CS5, \
-		                                CS4, CS4, CS4, CS4 ) ); \
-   V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
-                              _mm256_set_epi32( CS7, CS7, CS7, CS7, \
-                                                CS6, CS6, CS6, CS6 ) ); \
-   M0 = mm256_bswap_32( buf[ 0] ); \
-   M1 = mm256_bswap_32( buf[ 1] ); \
-   M2 = mm256_bswap_32( buf[ 2] ); \
-   M3 = mm256_bswap_32( buf[ 3] ); \
-   M4 = mm256_bswap_32( buf[ 4] ); \
-   M5 = mm256_bswap_32( buf[ 5] ); \
-   M6 = mm256_bswap_32( buf[ 6] ); \
-   M7 = mm256_bswap_32( buf[ 7] ); \
-   ROUND_Sx2_4WAY(0); \
-   ROUND_Sx2_4WAY(1); \
-   ROUND_Sx2_4WAY(2); \
-   ROUND_Sx2_4WAY(3); \
-   ROUND_Sx2_4WAY(4); \
-   ROUND_Sx2_4WAY(5); \
-   ROUND_Sx2_4WAY(6); \
-   ROUND_Sx2_4WAY(7); \
-   if (rounds == 14) \
-   { \
-      ROUND_Sx2_4WAY(8); \
-      ROUND_Sx2_4WAY(9); \
-      ROUND_Sx2_4WAY(0); \
-      ROUND_Sx2_4WAY(1); \
-      ROUND_Sx2_4WAY(2); \
-      ROUND_Sx2_4WAY(3); \
-   } \
-   H0 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( V8, V0 ), S0 ), H0 ); \
-   H1 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( V9, V1 ), S1 ), H1 ); \
-   H2 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( VA, V2 ), S2 ), H2 ); \
-   H3 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( VB, V3 ), S3 ), H3 ); \
-} while (0)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
--- a/algo/blake/blake2b-4way.c
+++ b/algo/blake/blake2b-4way.c
@@ -0,0 +1,67 @@
+/**
+ * Blake2-B Implementation
+ * tpruvot@github 2015-2016
+ */
+
+#include "blake2b-gate.h"
+
+#if defined(BLAKE2B_4WAY)
+
+#include <string.h>
+#include <stdint.h>
+#include "blake2b-hash-4way.h"
+
+// One-shot 4-way hash of an 80-byte input; the scan loop inlines these calls.
+void blake2b_4way_hash(void *output, const void *input)
+{
+    blake2b_4way_ctx state;
+    blake2b_4way_init( &state );
+    blake2b_4way_update( &state, input, 80 );
+    blake2b_4way_final( &state, output );
+}
+
+// Scan nonces four per pass (one per lane) from pdata[19] until max_nonce-4 or
+// a work restart; always returns 0, hits go through submit_lane_solution().
+int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const int thr_id = mythr->id;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t *hash7 = hash + 25;       // 3*8+1; lane k read at hash7[2*k]
+   __m256i *noncev = (__m256i*)vdata + 9;   // tenth vector of vdata, aligned
+   uint32_t n = first_nonce;
+
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+
+   do {
+      const __m256i nonces = mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) );
+      *noncev = mm256_intrlv_blend_32( nonces, *noncev );
+      blake2b_4way_init( &ctx );
+      blake2b_4way_update( &ctx, vdata, 80 );
+      blake2b_4way_final( &ctx, hash );
+      for ( int lane = 0; lane < 4; lane++ )
+      {
+          if ( hash7[ lane<<1 ] >= Htarg ) continue;   // cheap pre-filter
+          extr_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 4;
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif
--- a/algo/blake/blake2b-gate.c
+++ b/algo/blake/blake2b-gate.c
@@ -0,0 +1,25 @@
+#include "blake2b-gate.h"
+
+/*
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blake2s_get_max64 ()
+{
+   return 0x7ffffLL;
+}
+*/
+
+// Wire blake2b into the gate: 4-way entry points if BLAKE2B_4WAY, else scalar.
+bool register_blake2b_algo( algo_gate_t* gate )
+{
+#if defined(BLAKE2B_4WAY)
+  gate->scanhash  = (void*)&scanhash_blake2b_4way;
+  gate->hash      = (void*)&blake2b_4way_hash;
+#else
+  gate->scanhash  = (void*)&scanhash_blake2b;
+  gate->hash      = (void*)&blake2b_hash;
+#endif
+  gate->optimizations = AVX2_OPT;   // the get_max64 override stays disabled
+  return true;
+}
+
+
--- a/algo/blake/blake2b-gate.h
+++ b/algo/blake/blake2b-gate.h
@@ -0,0 +1,26 @@
+#ifndef __BLAKE2B_GATE_H__
+#define __BLAKE2B_GATE_H__ 1
+
+#include <stdint.h>
+#include "algo-gate-api.h"
+
+#if defined(__AVX2__)
+  #define BLAKE2B_4WAY
+#endif
+
+bool register_blake2b_algo( algo_gate_t* gate );
+
+#if defined(BLAKE2B_4WAY)
+
+void blake2b_4way_hash( void *state, const void *input );
+int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+#else
+
+void blake2b_hash( void *state, const void *input );
+int scanhash_blake2b( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
+#endif
--- a/algo/blake/blake2b-hash-4way.c
+++ b/algo/blake/blake2b-hash-4way.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2009 Colin Percival, 2014 savale
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "blake2b-hash-4way.h"
+
+#if defined(__AVX2__)
+
+// G mixing function: one BLAKE2b G step on four 64-bit lanes per vector.
+// a, b, c, d index the caller's local v[]; x, y are the two message vectors.
+#define B2B_G(a, b, c, d, x, y) \
+do { \
+   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), (x) ); \
+   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
+   v[c] = _mm256_add_epi64( v[c], v[d] ); \
+   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
+   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), (y) ); \
+   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
+   v[c] = _mm256_add_epi64( v[c], v[d] ); \
+   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
+} while (0)
+
+// Initialization Vector.
+/*
+static const uint64_t blake2b_iv[8] = {
+	0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+	0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+	0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+	0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+};
+*/
+
+// Compress the buffered block ctx->b[0..15] into ctx->h[0..7], all four lanes
+// at once. A nonzero last flags the final block of the message.
+static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
+{
+   static const uint8_t sigma[12][16] = {  // static: not rebuilt per call
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+      { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+      { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+      { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+      { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+      { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+      { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+      { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+      { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+   };
+   __m256i v[16], m[16];
+   // Work vector: v[0..7] = chaining state, v[8..15] = initialization vector.
+   v[ 0] = ctx->h[0];
+   v[ 1] = ctx->h[1];
+   v[ 2] = ctx->h[2];
+   v[ 3] = ctx->h[3];
+   v[ 4] = ctx->h[4];
+   v[ 5] = ctx->h[5];
+   v[ 6] = ctx->h[6];
+   v[ 7] = ctx->h[7];
+   v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
+   v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
+   v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
+   v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   v[12] = m256_const1_64( 0x510E527FADE682D1 );
+   v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
+   v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
+   // Fold the byte counter t[] into v[12] and v[13].
+   v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
+   v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
+   if ( last )   // final block: v[14] goes through mm256_not
+      v[14] = mm256_not( v[14] );
+   // Snapshot the message block so the rounds index a local array.
+   m[ 0] = ctx->b[ 0];
+   m[ 1] = ctx->b[ 1];
+   m[ 2] = ctx->b[ 2];
+   m[ 3] = ctx->b[ 3];
+   m[ 4] = ctx->b[ 4];
+   m[ 5] = ctx->b[ 5];
+   m[ 6] = ctx->b[ 6];
+   m[ 7] = ctx->b[ 7];
+   m[ 8] = ctx->b[ 8];
+   m[ 9] = ctx->b[ 9];
+   m[10] = ctx->b[10];
+   m[11] = ctx->b[11];
+   m[12] = ctx->b[12];
+   m[13] = ctx->b[13];
+   m[14] = ctx->b[14];
+   m[15] = ctx->b[15];
+   // 12 rounds: four column mixes, then four diagonal mixes, per sigma row.
+   for ( int i = 0; i < 12; i++ )
+   {
+      B2B_G( 0, 4,  8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
+      B2B_G( 1, 5,  9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
+      B2B_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
+      B2B_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
+      B2B_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
+      B2B_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
+      B2B_G( 2, 7,  8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
+      B2B_G( 3, 4,  9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
+   }
+   // Feed-forward: h[k] ^= v[k] ^ v[k+8].
+   ctx->h[0] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[0], v[0] ), v[ 8] );
+   ctx->h[1] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[1], v[1] ), v[ 9] );
+   ctx->h[2] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[2], v[2] ), v[10] );
+   ctx->h[3] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[3], v[3] ), v[11] );
+   ctx->h[4] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[4], v[4] ), v[12] );
+   ctx->h[5] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[5], v[5] ), v[13] );
+   ctx->h[6] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[6], v[6] ), v[14] );
+   ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
+}
+
+// Reset ctx for a new 4-way hash with a 32-byte digest. Always returns 0.
+int blake2b_4way_init( blake2b_4way_ctx *ctx )
+{
+   // Start with an empty, zeroed input buffer and cleared byte counters.
+   for ( size_t slot = 0; slot < 16; slot++ )
+      ctx->b[ slot ] = m256_zero;
+   ctx->c      = 0;
+   ctx->t[0]   = 0;
+   ctx->t[1]   = 0;
+   ctx->outlen = 32;
+
+   // Chaining state starts from the initialization vector ...
+   ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
+   ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
+   ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
+   ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
+   ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
+   ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
+   // ... with 0x01010020 mixed into h[0]; presumably the parameter word (its
+   // low byte 0x20 matches outlen = 32) -- confirm against the BLAKE2 spec.
+   ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );
+   return 0;
+}
+
+// Absorb inlen bytes per lane from input, read as one __m256i per 8 bytes.
+// Only whole 8-byte words are consumed; a trailing partial word is ignored.
+void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+                          size_t inlen )
+{
+   const __m256i *in = (const __m256i*)input;
+   size_t i, c = ctx->c >> 3;   // c: next free vector slot in ctx->b[]
+   for ( i = 0; i < (inlen >> 3); i++ )
+   {
+      if ( ctx->c == 128 )
+      {
+         ctx->t[0] += ctx->c;
+         if ( ctx->t[0] < ctx->c )   // wrapped: carry into the high word
+            ctx->t[1]++;
+         blake2b_4way_compress( ctx, 0 );
+         // Reset the slot index with the byte count, else we write past b[15].
+         c = ctx->c = 0;
+      }
+      ctx->b[ c++ ] = in[i];
+      ctx->c += 8;
+   }
+}
+
+// Finish the hash: zero-pad the pending block, compress it as the last block
+// and store h[0..3] to out through casti_m256i( out, 0..3 ).
+void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
+{
+   size_t slot = ctx->c >> 3;   // first unused vector in ctx->b[]
+
+   // Account for the buffered bytes, carrying into t[1] on wrap-around.
+   ctx->t[0] += ctx->c;
+   ctx->t[1] += ( ctx->t[0] < ctx->c ) ? 1 : 0;
+
+   // Zero-fill the rest of the block, one vector (8 bytes of count) at a time.
+   for ( ; ctx->c < 128; ctx->c += 8 )
+      ctx->b[ slot++ ] = m256_zero;
+
+   blake2b_4way_compress( ctx, 1 );   // last = 1
+
+   // ctx->outlen is not consulted; exactly four state vectors are written.
+   casti_m256i( out, 0 ) = ctx->h[0];
+   casti_m256i( out, 1 ) = ctx->h[1];
+   casti_m256i( out, 2 ) = ctx->h[2];
+   casti_m256i( out, 3 ) = ctx->h[3];
+}
+
+#endif
--- a/algo/blake/blake2b-hash-4way.h
+++ b/algo/blake/blake2b-hash-4way.h
@@ -0,0 +1,35 @@
+#pragma once
+#ifndef __BLAKE2B_HASH_4WAY_H__
+#define __BLAKE2B_HASH_4WAY_H__
+
+#if defined(__AVX2__)
+
+#include "simd-utils.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <inttypes.h>
+#define inline __inline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+// State context for the 4-way BLAKE2b hash (buffer and state are __m256i).
+ALIGN(64) typedef struct {
+	__m256i b[16]; // input buffer: one vector per 8 bytes counted in c
+	__m256i h[8];  // chained state
+	uint64_t t[2];  // total number of bytes absorbed (t[1] takes the carry)
+	size_t c;       // bytes currently buffered in b[] (0..128), not a pointer
+	size_t outlen;  // digest size in bytes (init sets 32)
+} blake2b_4way_ctx __attribute__((aligned(64)));
+
+int blake2b_4way_init( blake2b_4way_ctx *ctx );
+void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+                          size_t inlen );
+void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
+
+#endif
+
+#endif
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -3,13 +3,11 @@
 * tpruvot@github 2015-2016
 */

-#include "algo-gate-api.h"
+#include "blake2b-gate.h"
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake2b.h"

-//static __thread sph_blake2b_ctx s_midstate;
-//static __thread sph_blake2b_ctx s_ctx;
 #define MIDLEN 76
 #define A 64

@@ -25,16 +23,6 @@ void blake2b_hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-/*
-static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
-{
-	s_ctx.outlen = MIDLEN;
-	memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
-	sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
-	sph_blake2b_final(&s_ctx, (uint8_t*) output);
-}
-*/
-
 int scanhash_blake2b( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
@@ -45,7 +33,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[8];
+	const uint32_t first_nonce = pdata[19];

 	uint32_t n = first_nonce;

@@ -53,179 +41,23 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}

-	// midstate (untested yet)
-	//blake2b_init(&s_midstate, 32, NULL, 0);
-	//blake2b_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
-	//memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx));
-
 	do {
-		be32enc(&endiandata[8], n);
+		be32enc(&endiandata[19], n);
 		//blake2b_hash_end(vhashcpu, endiandata);
 		blake2b_hash(vhashcpu, endiandata);

 		if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
 			work_set_target_ratio(work, vhashcpu);
 			*hashes_done = n - first_nonce + 1;
-			pdata[8] = n;
+			pdata[19] = n;
 			return 1;
 		}
 		n++;

 	} while (n < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = n - first_nonce + 1;
-	pdata[8] = n;
+	pdata[19] = n;

 	return 0;
 }

-static inline void swab256(void *dest_p, const void *src_p)
-{
-	uint32_t *dest = (uint32_t *)dest_p;
-	const uint32_t *src = (uint32_t *)src_p;
-
-	dest[0] = swab32(src[7]);
-	dest[1] = swab32(src[6]);
-	dest[2] = swab32(src[5]);
-	dest[3] = swab32(src[4]);
-	dest[4] = swab32(src[3]);
-	dest[5] = swab32(src[2]);
-	dest[6] = swab32(src[1]);
-	dest[7] = swab32(src[0]);
-}
-
-/* compute nbits to get the network diff */
-void blake2b_calc_network_diff(struct work *work)
-{
-        // sample for diff 43.281 : 1c05ea29
-        uint32_t nbits = work->data[11]; // unsure if correct
-        uint32_t bits = (nbits & 0xffffff);
-        int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
-
-        double d = (double)0x0000ffff / (double)bits;
-        for (int m=shift; m < 29; m++) d *= 256.0;
-        for (int m=29; m < shift; m++) d /= 256.0;
-        if (opt_debug_diff)
-                applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
-        net_diff = d;
-}
-
-void blake2b_be_build_stratum_request( char *req, struct work *work )
-{
-   unsigned char *xnonce2str;
-   uint32_t ntime,       nonce;
-   char     ntimestr[9], noncestr[9];
-   be32enc( &ntime, work->data[ algo_gate.ntime_index ] );
-   be32enc( &nonce, work->data[ algo_gate.nonce_index ] );
-   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
-   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
-   uint16_t high_nonce = swab32(work->data[9]) >> 16;
-   xnonce2str = abin2hex((unsigned char*)(&high_nonce), 2);
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
-   free( xnonce2str );
-}
-
-#define min(a,b) (a>b ? (b) :(a))
-
-// merkle root handled here, no need for gen_merkle_root gate target
-void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
-{
-    uchar merkle_root[64] = { 0 };
-    uint32_t extraheader[32] = { 0 };
-    int headersize = 0;
-    size_t t;
-    int i;
-
-    // merkle root
-    memcpy( merkle_root, sctx->job.coinbase, 32 );
-    headersize = min( (int)sctx->job.coinbase_size - 32, sizeof(extraheader) );
-    memcpy( extraheader, &sctx->job.coinbase[32], headersize );
-    // Increment extranonce2 
-    for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
-    // Assemble block header 
-    memset( g_work->data, 0, sizeof(g_work->data) );
-//    g_work->data[0] = le32dec( sctx->job.version );
-//    for ( i = 0; i < 8; i++ )
-//       g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
-    for ( i = 0; i < 8; i++ )
-       g_work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i];
-//    for ( i = 0; i < 8; i++ )
-//       g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
-    g_work->data[8]  = 0; // nonce
-    g_work->data[9]  = swab32( extraheader[0] ) | ( rand() & 0xf0 );
-    g_work->data[10] = be32dec( sctx->job.ntime );
-    g_work->data[11] = be32dec( sctx->job.nbits );
-    for ( i = 0; i < 8; i++ )
-       g_work->data[12+i] = ( (uint32_t*)merkle_root )[i];
-}
-
-#undef min
-
-void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id,
-                           uint32_t* end_nonce_ptr, bool clean_job )
-{
-   const int wkcmp_sz = 32;  // bytes
-   const int wkcmp_off = 32 + 16; 
-   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
-
-   if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz )
-      && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) 
-      || strcmp( work->job_id, g_work->job_id ) ) )
-   {
-      work_free( work );
-      work_copy( work, g_work );
-      *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id;
-      if ( opt_randomize )
-         *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
-      *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
-   }
-   else
-       ++(*nonceptr);
-
-   // suprnova job_id check without data/target/height change...
-   // we just may have copied new g_wwork to work so why this test here?
-//   if (  have_stratum && strcmp( work->job_id, g_work->job_id ) )
-      // exit thread loop
-//      continue;
-//   else
-//   {
-//      nonceptr[1] += 0x10;
-//      nonceptr[1] |= thr_id;
-//   }
-}
-
-bool blake2b_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                           int thr_id )
-{
-   if ( have_stratum && strcmp( stratum->job.job_id, work->job_id ) )
-      // need to regen g_work..
-      return false;
-   // extradata: prevent duplicates
-   work->data[ 8     ] += 0x10;
-   work->data[ 8 + 1 ] |= thr_id;
-   return true;
-}
-
-double blake2b_get_max64() { return 0x1fffffLL; }
-
-bool register_blake2b_algo( algo_gate_t* gate )
-{
-  algo_not_tested();
-  gate->ntime_index   = 10;
-  gate->nbits_index   = 11;
-  gate->nonce_index   =  8;
-  gate->work_cmp_size = 32;
-  gate->scanhash              = (void*)&scanhash_blake2b;
-  gate->hash                  = (void*)&blake2b_hash;
-  gate->calc_network_diff     = (void*)&blake2b_calc_network_diff;
-  gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
-  gate->work_decode           = (void*)&std_be_work_decode;
-  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
-  gate->build_extraheader     = (void*)&blake2b_build_extraheader;
-  gate->get_new_work          = (void*)&blake2b_get_new_work;
-  gate->get_max64             = (void*)&blake2b_get_max64;
-  gate->ready_to_mine         = (void*)&blake2b_ready_to_mine;
-  have_gbt = false;
-  return true;
-}
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -20,7 +20,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
  gate->hash      = (void*)&blake2s_hash;
 #endif
  gate->get_max64 = (void*)&blake2s_get_max64;
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
  return true;
 };

--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -4,7 +4,8 @@
 #include <stdint.h>
 #include "algo-gate-api.h"

-#if defined(__SSE4_2__)
+//#if defined(__SSE4_2__)
+#if defined(__SSE2__)
  #define BLAKE2S_4WAY
 #endif
 #if defined(__AVX2__)
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -17,7 +17,9 @@
 #include <string.h>
 #include <stdio.h>

-#if defined(__SSE4_2__)
+//#if defined(__SSE4_2__)
+#if defined(__SSE2__)
+

 static const uint32_t blake2s_IV[8] =
 {
@@ -57,8 +59,18 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
   memset( P->personal, 0, sizeof( P->personal ) );

   memset( S, 0, sizeof( blake2s_4way_state ) );
-   for( int i = 0; i < 8; ++i )
-      S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
+
+   S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
+   S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
+   S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
+   S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
+   S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
+   
+//   for( int i = 0; i < 8; ++i )
+//      S->h[i] = _mm_set1_epi32( blake2s_IV[i] );

   uint32_t *p = ( uint32_t * )( P );

@@ -267,8 +279,18 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
   memset( P->personal, 0, sizeof( P->personal ) );

   memset( S, 0, sizeof( blake2s_8way_state ) );
-   for( int i = 0; i < 8; ++i )
-      S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
+   S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
+   S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
+   S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
+   S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
+   S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
+
+
+//   for( int i = 0; i < 8; ++i )
+//      S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );

   uint32_t *p = ( uint32_t * )( P );

--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -14,7 +14,8 @@
 #ifndef __BLAKE2S_HASH_4WAY_H__
 #define __BLAKE2S_HASH_4WAY_H__ 1

-#if defined(__SSE4_2__)
+//#if defined(__SSE4_2__)
+#if defined(__SSE2__)

 #include "simd-utils.h"

--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -307,12 +307,12 @@ static const sph_u64 CB[16] = {

 #define GB_4WAY(m0, m1, c0, c1, a, b, c, d)   do { \
   a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
-                 _mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
+                 _mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
   d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
   a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
-                 _mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \
+                 _mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
   d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
@@ -479,20 +479,20 @@ static const sph_u64 CB[16] = {
  V5 = H5; \
  V6 = H6; \
  V7 = H7; \
-  V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) );  \
-  V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) );  \
-  VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) );  \
-  VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) );  \
+  V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) );  \
+  V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) );  \
+  VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) );  \
+  VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) );  \
  VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
-                         _mm256_set1_epi64x( CB4 ) );  \
+                         m256_const1_64( CB4 ) );  \
  VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
-                         _mm256_set1_epi64x( CB5 ) );  \
+                         m256_const1_64( CB5 ) );  \
  VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
-                         _mm256_set1_epi64x( CB6 ) );  \
+                         m256_const1_64( CB6 ) );  \
  VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
-                         _mm256_set1_epi64x( CB7 ) );  \
-  shuf_bswap64 = _mm256_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
-                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+                         m256_const1_64( CB7 ) );  \
+  shuf_bswap64 = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
  M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
  M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
  M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -544,14 +544,14 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
              const sph_u64 *salt )
 {
   __m256i zero = m256_zero;
-   casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( iv[0] );
-   casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( iv[1] );
-   casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( iv[2] );
-   casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( iv[3] );
-   casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( iv[4] );
-   casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( iv[5] );
-   casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( iv[6] );
-   casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( iv[7] );
+   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
+   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
+   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
+   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
+   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );

   casti_m256i( sc->S, 0 ) = zero;
   casti_m256i( sc->S, 1 ) = zero;
@@ -642,11 +642,9 @@ blake64_4way_close( blake_4way_big_context *sc,
       memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       if ( out_size_w64 == 8 )
          buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
-                                 _mm256_set1_epi64x( 0x0100000000000000ULL ) );
-       *(buf+(112>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(buf+(120>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( tl, tl, tl, tl ) );
+                                 m256_const1_64( 0x0100000000000000ULL ) );
+       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
+       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
   }
@@ -659,11 +657,9 @@ blake64_4way_close( blake_4way_big_context *sc,
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
       memset_zero_256( buf, 112>>3 ); 
       if ( out_size_w64 == 8 )
-           buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
-       *(buf+(112>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(buf+(120>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( tl, tl, tl, tl ) );
+           buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
+       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
+       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );

       blake64_4way( sc, buf, 128 );
   }
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -103,7 +103,6 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
 	v[13] ^= ctx->t[1];                 // high 64 bits
 	if (last)                           // last block flag set ?
 		v[14] = ~v[14];
-
 	for (i = 0; i < 16; i++)            // get little-endian words
 		m[i] = B2B_GET64(&ctx->b[8 * i]);

@@ -184,7 +183,8 @@ void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out )

 	while (ctx->c < 128)                // fill up with zeros
 		ctx->b[ctx->c++] = 0;
-	blake2b_compress(ctx, 1);           // final block flag = 1
+
+   blake2b_compress(ctx, 1);           // final block flag = 1

 	// little endian convert and store
 	for (i = 0; i < ctx->outlen; i++) {