v3.7.5

2025-09-17 23:44:27 +00:00 · 2017-12-08 15:39:28 -05:00
parent 4b57ac0eb9
commit af1c940919
53 changed files with 1324 additions and 4790 deletions
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -9,18 +9,18 @@

 void blakehash_4way(void *state, const void *input)
 {
-     uint32_t hash0[16] __attribute__ ((aligned (64)));
-     uint32_t hash1[16] __attribute__ ((aligned (64)));
-     uint32_t hash2[16] __attribute__ ((aligned (64)));
-     uint32_t hash3[16] __attribute__ ((aligned (64)));
-     uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));  // 4 lanes x 256 bits, interleaved
+     uint32_t hash0[8] __attribute__ ((aligned (32)));    // 256 bits per lane, 32 bytes are copied out below
+     uint32_t hash1[8] __attribute__ ((aligned (32)));
+     uint32_t hash2[8] __attribute__ ((aligned (32)));
+     uint32_t hash3[8] __attribute__ ((aligned (32)));
     blake256_4way_context ctx;

     blake256_4way_init( &ctx );
     blake256_4way( &ctx, input, 16 );
     blake256_4way_close( &ctx, vhash );

-     m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

     memcpy( state,    hash0, 32 );
     memcpy( state+32, hash1, 32 );
@@ -32,7 +32,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done )
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));  // presumably receives 4 x 32 byte digests from blakehash_4way
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -49,7 +49,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
   // we need big endian data...
   swab32_array( endiandata, pdata, 20 );

-   m128_interleave_4x32( vdata, endiandata, endiandata, endiandata,
+   mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
                         endiandata, 640 );

   uint32_t *noncep = vdata + 76;   // 19*4
--- a/algo/blake/blake-gate.c
+++ b/algo/blake/blake-gate.c
@@ -13,11 +13,11 @@ bool register_blake_algo( algo_gate_t* gate )
 //  gate->scanhash  = (void*)&scanhash_blake_8way;
 //  gate->hash      = (void*)&blakehash_8way;
 #if defined(BLAKE_4WAY)
-  gate->optimizations = SSE2_OPT | AVX_OPT;
+  four_way_not_tested();
+  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash  = (void*)&scanhash_blake_4way;
  gate->hash      = (void*)&blakehash_4way;
 #else
-  gate->optimizations = SSE2_OPT;
  gate->scanhash  = (void*)&scanhash_blake;
  gate->hash      = (void*)&blakehash;
 #endif
--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
@@ -536,22 +536,22 @@ do { \
                          , _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
        VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
                            _mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
-	M[0x0] = mm_byteswap_epi32( *(buf +  0) ); \
-	M[0x1] = mm_byteswap_epi32( *(buf +  1) ); \
-	M[0x2] = mm_byteswap_epi32( *(buf +  2) ); \
-	M[0x3] = mm_byteswap_epi32( *(buf +  3) ); \
-	M[0x4] = mm_byteswap_epi32( *(buf +  4) ); \
-	M[0x5] = mm_byteswap_epi32( *(buf +  5) ); \
-	M[0x6] = mm_byteswap_epi32( *(buf +  6) ); \
-	M[0x7] = mm_byteswap_epi32( *(buf +  7) ); \
-	M[0x8] = mm_byteswap_epi32( *(buf +  8) ); \
-	M[0x9] = mm_byteswap_epi32( *(buf +  9) ); \
-	M[0xA] = mm_byteswap_epi32( *(buf + 10) ); \
-	M[0xB] = mm_byteswap_epi32( *(buf + 11) ); \
-	M[0xC] = mm_byteswap_epi32( *(buf + 12) ); \
-	M[0xD] = mm_byteswap_epi32( *(buf + 13) ); \
-	M[0xE] = mm_byteswap_epi32( *(buf + 14) ); \
-	M[0xF] = mm_byteswap_epi32( *(buf + 15) ); \
+	M[0x0] = mm_byteswap_32( *(buf +  0) ); \
+	M[0x1] = mm_byteswap_32( *(buf +  1) ); \
+	M[0x2] = mm_byteswap_32( *(buf +  2) ); \
+	M[0x3] = mm_byteswap_32( *(buf +  3) ); \
+	M[0x4] = mm_byteswap_32( *(buf +  4) ); \
+	M[0x5] = mm_byteswap_32( *(buf +  5) ); \
+	M[0x6] = mm_byteswap_32( *(buf +  6) ); \
+	M[0x7] = mm_byteswap_32( *(buf +  7) ); \
+	M[0x8] = mm_byteswap_32( *(buf +  8) ); \
+	M[0x9] = mm_byteswap_32( *(buf +  9) ); \
+	M[0xA] = mm_byteswap_32( *(buf + 10) ); \
+	M[0xB] = mm_byteswap_32( *(buf + 11) ); \
+	M[0xC] = mm_byteswap_32( *(buf + 12) ); \
+	M[0xD] = mm_byteswap_32( *(buf + 13) ); \
+	M[0xE] = mm_byteswap_32( *(buf + 14) ); \
+	M[0xF] = mm_byteswap_32( *(buf + 15) ); \
 	for (r = 0; r < BLAKE32_ROUNDS; r ++) \
 		ROUND_S_4WAY(r); \
        H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -601,22 +601,22 @@ do { \
                            _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
        VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
                            _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
-	M0 = mm_byteswap_epi32( * buf ); \
-	M1 = mm_byteswap_epi32( *(buf+1) ); \
-	M2 = mm_byteswap_epi32( *(buf+2) ); \
-	M3 = mm_byteswap_epi32( *(buf+3) ); \
-	M4 = mm_byteswap_epi32( *(buf+4) ); \
-	M5 = mm_byteswap_epi32( *(buf+5) ); \
-	M6 = mm_byteswap_epi32( *(buf+6) ); \
-	M7 = mm_byteswap_epi32( *(buf+7) ); \
-	M8 = mm_byteswap_epi32( *(buf+8) ); \
-	M9 = mm_byteswap_epi32( *(buf+9) ); \
-	MA = mm_byteswap_epi32( *(buf+10) ); \
-	MB = mm_byteswap_epi32( *(buf+11) ); \
-	MC = mm_byteswap_epi32( *(buf+12) ); \
-	MD = mm_byteswap_epi32( *(buf+13) ); \
-	ME = mm_byteswap_epi32( *(buf+14) ); \
-	MF = mm_byteswap_epi32( *(buf+15) ); \
+	M0 = mm_byteswap_32( * buf ); \
+	M1 = mm_byteswap_32( *(buf+1) ); \
+	M2 = mm_byteswap_32( *(buf+2) ); \
+	M3 = mm_byteswap_32( *(buf+3) ); \
+	M4 = mm_byteswap_32( *(buf+4) ); \
+	M5 = mm_byteswap_32( *(buf+5) ); \
+	M6 = mm_byteswap_32( *(buf+6) ); \
+	M7 = mm_byteswap_32( *(buf+7) ); \
+	M8 = mm_byteswap_32( *(buf+8) ); \
+	M9 = mm_byteswap_32( *(buf+9) ); \
+	MA = mm_byteswap_32( *(buf+10) ); \
+	MB = mm_byteswap_32( *(buf+11) ); \
+	MC = mm_byteswap_32( *(buf+12) ); \
+	MD = mm_byteswap_32( *(buf+13) ); \
+	ME = mm_byteswap_32( *(buf+14) ); \
+	MF = mm_byteswap_32( *(buf+15) ); \
 	ROUND_S_4WAY(0); \
 	ROUND_S_4WAY(1); \
 	ROUND_S_4WAY(2); \
@@ -722,22 +722,22 @@ do { \
                               _mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
        VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
                              _mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
-	M[0x0] = mm256_byteswap_epi64( *(buf+0) ); \
-	M[0x1] = mm256_byteswap_epi64( *(buf+1) ); \
-	M[0x2] = mm256_byteswap_epi64( *(buf+2) ); \
-	M[0x3] = mm256_byteswap_epi64( *(buf+3) ); \
-	M[0x4] = mm256_byteswap_epi64( *(buf+4) ); \
-	M[0x5] = mm256_byteswap_epi64( *(buf+5) ); \
-	M[0x6] = mm256_byteswap_epi64( *(buf+6) ); \
-	M[0x7] = mm256_byteswap_epi64( *(buf+7) ); \
-	M[0x8] = mm256_byteswap_epi64( *(buf+8) ); \
-	M[0x9] = mm256_byteswap_epi64( *(buf+9) ); \
-	M[0xA] = mm256_byteswap_epi64( *(buf+10) ); \
-	M[0xB] = mm256_byteswap_epi64( *(buf+11) ); \
-	M[0xC] = mm256_byteswap_epi64( *(buf+12) ); \
-	M[0xD] = mm256_byteswap_epi64( *(buf+13) ); \
-	M[0xE] = mm256_byteswap_epi64( *(buf+14) ); \
-	M[0xF] = mm256_byteswap_epi64( *(buf+15) ); \
+	M[0x0] = mm256_byteswap_64( *(buf+0) ); \
+	M[0x1] = mm256_byteswap_64( *(buf+1) ); \
+	M[0x2] = mm256_byteswap_64( *(buf+2) ); \
+	M[0x3] = mm256_byteswap_64( *(buf+3) ); \
+	M[0x4] = mm256_byteswap_64( *(buf+4) ); \
+	M[0x5] = mm256_byteswap_64( *(buf+5) ); \
+	M[0x6] = mm256_byteswap_64( *(buf+6) ); \
+	M[0x7] = mm256_byteswap_64( *(buf+7) ); \
+	M[0x8] = mm256_byteswap_64( *(buf+8) ); \
+	M[0x9] = mm256_byteswap_64( *(buf+9) ); \
+	M[0xA] = mm256_byteswap_64( *(buf+10) ); \
+	M[0xB] = mm256_byteswap_64( *(buf+11) ); \
+	M[0xC] = mm256_byteswap_64( *(buf+12) ); \
+	M[0xD] = mm256_byteswap_64( *(buf+13) ); \
+	M[0xE] = mm256_byteswap_64( *(buf+14) ); \
+	M[0xF] = mm256_byteswap_64( *(buf+15) ); \
 	for (r = 0; r < 16; r ++) \
 		ROUND_B_4WAY(r); \
        H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -787,22 +787,22 @@ do { \
                            _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) );  \
     VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
                            _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) );  \
-     M0 = mm256_byteswap_epi64( *(buf + 0) ); \
-     M1 = mm256_byteswap_epi64( *(buf + 1) ); \
-     M2 = mm256_byteswap_epi64( *(buf + 2) ); \
-     M3 = mm256_byteswap_epi64( *(buf + 3) ); \
-     M4 = mm256_byteswap_epi64( *(buf + 4) ); \
-     M5 = mm256_byteswap_epi64( *(buf + 5) ); \
-     M6 = mm256_byteswap_epi64( *(buf + 6) ); \
-     M7 = mm256_byteswap_epi64( *(buf + 7) ); \
-     M8 = mm256_byteswap_epi64( *(buf + 8) ); \
-     M9 = mm256_byteswap_epi64( *(buf + 9) ); \
-     MA = mm256_byteswap_epi64( *(buf + 10) ); \
-     MB = mm256_byteswap_epi64( *(buf + 11) ); \
-     MC = mm256_byteswap_epi64( *(buf + 12) ); \
-     MD = mm256_byteswap_epi64( *(buf + 13) ); \
-     ME = mm256_byteswap_epi64( *(buf + 14) ); \
-     MF = mm256_byteswap_epi64( *(buf + 15) ); \
+     M0 = mm256_byteswap_64( *(buf + 0) ); \
+     M1 = mm256_byteswap_64( *(buf + 1) ); \
+     M2 = mm256_byteswap_64( *(buf + 2) ); \
+     M3 = mm256_byteswap_64( *(buf + 3) ); \
+     M4 = mm256_byteswap_64( *(buf + 4) ); \
+     M5 = mm256_byteswap_64( *(buf + 5) ); \
+     M6 = mm256_byteswap_64( *(buf + 6) ); \
+     M7 = mm256_byteswap_64( *(buf + 7) ); \
+     M8 = mm256_byteswap_64( *(buf + 8) ); \
+     M9 = mm256_byteswap_64( *(buf + 9) ); \
+     MA = mm256_byteswap_64( *(buf + 10) ); \
+     MB = mm256_byteswap_64( *(buf + 11) ); \
+     MC = mm256_byteswap_64( *(buf + 12) ); \
+     MD = mm256_byteswap_64( *(buf + 13) ); \
+     ME = mm256_byteswap_64( *(buf + 14) ); \
+     MF = mm256_byteswap_64( *(buf + 15) ); \
     ROUND_B_4WAY(0); \
     ROUND_B_4WAY(1); \
     ROUND_B_4WAY(2); \
@@ -870,7 +870,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )

 	if ( len < buf_size - ptr )
        {
-		memcpy_m128i( buf + (ptr>>2), vdata, len>>2 );
+		memcpy_128( buf + (ptr>>2), vdata, len>>2 );
 		ptr += len;
 		sc->ptr = ptr;
 		return;
@@ -884,7 +884,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
 	   clen = buf_size - ptr;
 	   if (clen > len)
 		clen = len;
-	   memcpy_m128i( buf + (ptr>>2), vdata, clen>>2 );
+	   memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
 	   ptr += clen;
           vdata += (clen>>2);
 	   len -= clen;
@@ -936,32 +936,32 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,

   if ( ptr <= 48 )
   {
-       memset_zero_m128i( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
+       memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
       if (out_size_w32 == 8)
           u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
                                    _mm_set_epi32( 0x010000000, 0x01000000,
                                                   0x010000000, 0x01000000 ) );
-       *(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
-       *(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
+       *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
+       *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
       blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
   }
   else
   {
-	memset_zero_m128i( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+	memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
 	blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
 	sc->T0 = SPH_C32(0xFFFFFE00);
 	sc->T1 = SPH_C32(0xFFFFFFFF);
-	memset_zero_m128i( u.buf, 56>>2 );
+	memset_zero_128( u.buf, 56>>2 );
       if (out_size_w32 == 8)
           u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
                                         0x010000000, 0x01000000 );
-        *(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
-        *(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
+        *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
+        *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
 	blake32_4way( sc, u.buf, 64 );
   }
   out = (__m128i*)dst;
   for ( k = 0; k < out_size_w32; k++ )
-        out[k] = mm_byteswap_epi32( sc->H[k] );
+        out[k] = mm_byteswap_32( sc->H[k] );
 }

 #if defined (__AVX2__)
@@ -995,7 +995,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
   ptr = sc->ptr;
   if ( len < (buf_size - ptr) )
   {
-	memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
+	memcpy_256( buf + (ptr>>3), vdata, len>>3 );
 	ptr += len;
 	sc->ptr = ptr;
 	return;
@@ -1009,7 +1009,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
 	clen = buf_size - ptr;
 	if ( clen > len )
 		clen = len;
-	memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
+	memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
 	ptr += clen;
 	vdata = vdata + (clen>>3);
 	len -= clen;
@@ -1062,44 +1062,44 @@ blake64_4way_close( blake_4way_big_context *sc,
   }
   if ( ptr <= 104 )
   {
-       memset_zero_m256i( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
+       memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       if ( out_size_w64 == 8 )
          u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
                                    _mm256_set_epi64x( 0x0100000000000000,
                                                       0x0100000000000000,
                                                       0x0100000000000000,
                                                       0x0100000000000000 ) );
-       *(u.buf+(112>>3)) = mm256_byteswap_epi64(
+       *(u.buf+(112>>3)) = mm256_byteswap_64(
                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(u.buf+(120>>3)) = mm256_byteswap_epi64(
+       *(u.buf+(120>>3)) = mm256_byteswap_64(
                                    _mm256_set_epi64x( tl, tl, tl, tl ) );

       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
   }
   else
  {
-       memset_zero_m256i( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
+       memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
-       memset_zero_m256i( u.buf, 112>>3 ); 
+       memset_zero_256( u.buf, 112>>3 ); 
       if ( out_size_w64 == 8 )
           u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
                                              0x0100000000000000,
                                              0x0100000000000000,
                                              0x0100000000000000 );

-       *(u.buf+(112>>3)) = mm256_byteswap_epi64(
+       *(u.buf+(112>>3)) = mm256_byteswap_64(
                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(u.buf+(120>>3)) = mm256_byteswap_epi64(
+       *(u.buf+(120>>3)) = mm256_byteswap_64(
                                    _mm256_set_epi64x( tl, tl, tl, tl ) );

       blake64_4way( sc, u.buf, 128 );
   }
   out = (__m256i*)dst;
   for ( k = 0; k < out_size_w64; k++ )
-       out[k] = mm256_byteswap_epi64( sc->H[k] );
+       out[k] = mm256_byteswap_64( sc->H[k] );
 }

 #endif
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -13,17 +13,17 @@ static __thread bool ctx_midstate_done = false;

 void decred_hash_4way( void *state, const void *input )
 {
-     uint32_t hash0[16] __attribute__ ((aligned (64)));
-     uint32_t hash1[16] __attribute__ ((aligned (64)));
-     uint32_t hash2[16] __attribute__ ((aligned (64)));
-     uint32_t hash3[16] __attribute__ ((aligned (64)));
-     uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));  // 4 lanes x 256 bits, interleaved
+     uint32_t hash0[8] __attribute__ ((aligned (32)));    // 256 bits per lane are deinterleaved into each
+     uint32_t hash1[8] __attribute__ ((aligned (32)));
+     uint32_t hash2[8] __attribute__ ((aligned (32)));
+     uint32_t hash3[8] __attribute__ ((aligned (32)));
     blake256_4way_context ctx __attribute__ ((aligned (64)));

     sph_blake256_context ctx2 __attribute__ ((aligned (64)));
     uint32_t hash[16] __attribute__ ((aligned (64)));
     uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
-     m128_deinterleave_4x32( sin0, sin1, sin2, sin3, (uint32_t*)input, 180*8 );
+     mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );

     void *tail = input + DECRED_MIDSTATE_LEN;
     int tail_len = 180 - DECRED_MIDSTATE_LEN; 
@@ -53,7 +53,7 @@ void decred_hash_4way( void *state, const void *input )
     blake256_4way( &ctx, input, 180 );
     blake256_4way_close( &ctx, vhash );

-     m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
 /*
        for ( int i = 0; i < 8; i++ )
          if ( hash[i] != hash0[i] )
@@ -79,7 +79,7 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done)
 {
   uint32_t vdata[45*4] __attribute__ ((aligned (64)));
-   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));  // presumably receives 4 x 32 byte digests from decred_hash_4way
        uint32_t _ALIGN(64) endiandata[48];
 //        uint32_t _ALIGN(64) hash32[8];
        uint32_t *pdata = work->data;
@@ -97,7 +97,8 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,

 //        memcpy(endiandata, pdata, 180);

-   m128_interleave_4x32( vdata, pdata, pdata, pdata, pdata, 180*8 );
+   // use the old way until  new way updated for size.
+   mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );

   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
   do {
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -144,7 +144,8 @@ bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
 bool register_decred_algo( algo_gate_t* gate )
 {
 #if defined(DECRED_4WAY)
-  gate->optimizations = SSE2_OPT | AVX_OPT;
+  four_way_not_tested();
+  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash  = (void*)&scanhash_decred_4way;
  gate->hash      = (void*)&decred_hash_4way;
 #else
@@ -153,9 +154,6 @@ bool register_decred_algo( algo_gate_t* gate )
  gate->hash      = (void*)&decred_hash;
 #endif

-//  gate->optimizations         = SSE2_OPT;
-//  gate->scanhash              = (void*)&scanhash_decred;
-//  gate->hash                  = (void*)&decred_hash;
  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
  gate->display_extra_data    = (void*)&decred_decode_extradata;
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -30,13 +30,13 @@ extern void pentablakehash_4way( void *output, const void *input )
     blake512_4way_close( &ctx, vhash );

 uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
-m256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
+mm256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
 sph_blake512_context ctx2_blake;
 sph_blake512_init(&ctx2_blake);
 sph_blake512(&ctx2_blake, sin0, 80);
 sph_blake512_close(&ctx2_blake, (void*) hash);

-m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 uint64_t* hash64 = (uint64_t*)hash;
 for( int i = 0; i < 8; i++ )
 {
@@ -60,7 +60,7 @@ for( int i = 0; i < 8; i++ )
     blake512_4way( &ctx, vhash, 64 );
     blake512_4way_close( &ctx, vhash );

-     m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     memcpy( output,    hash0, 32 );
     memcpy( output+32, hash1, 32 );
     memcpy( output+64, hash2, 32 );
@@ -141,7 +141,7 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
    swab32_array( endiandata, pdata, 20 );

    uint64_t *edata = (uint64_t*)endiandata;
-    m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

    for ( int m=0; m < 6; m++ )
    {
--- a/algo/blake/pentablake-gate.c
+++ b/algo/blake/pentablake-gate.c
@@ -3,13 +3,13 @@
 bool register_pentablake_algo( algo_gate_t* gate )
 {
 #if defined (PENTABLAKE_4WAY)
-    gate->optimizations = SSE2_OPT | AVX2_OPT;
    gate->scanhash  = (void*)&scanhash_pentablake_4way;
    gate->hash      = (void*)&pentablakehash_4way;
 #else
    gate->scanhash  = (void*)&scanhash_pentablake;
    gate->hash      = (void*)&pentablakehash;
 #endif
+    gate->optimizations = FOUR_WAY_OPT;
    gate->get_max64 = (void*)&get_max64_0x3ffff;
    return true;
 };
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -95,13 +95,13 @@ extern "C"{
 #define Sb(x0, x1, x2, x3, c) \
 do { \
   __m256i cc = _mm256_set_epi64x( c, c, c, c ); \
-    x3 = mm256_bitnot( x3 ); \
-    x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_bitnot( x2 ) ) ); \
+    x3 = mm256_not( x3 ); \
+    x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
    tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
    x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
-    x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_bitnot( x1 ), x2 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
    x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
-    x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_bitnot( x3 ) ) ); \
+    x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
    x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
    x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
    x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
@@ -532,7 +532,7 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )

   if ( len < (buf_size - ptr) )
   {
-       memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
+       memcpy_256( buf + (ptr>>3), vdata, len>>3 );
       ptr += len;
       sc->ptr = ptr;
       return;
@@ -546,7 +546,7 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
       if ( clen > len )
          clen = len;

-       memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
+       memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
       ptr += clen;
       vdata += (clen>>3);
       len -= clen;
@@ -579,7 +579,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
   else
       numz = 112 - sc->ptr;

-   memset_zero_m256i( buf+1, (numz>>3) - 1 );   
+   memset_zero_256( buf+1, (numz>>3) - 1 );   

   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
   l1 = SPH_T64(sc->block_count >> 55);
@@ -593,7 +593,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
   for ( u=0; u < 8; u++ )
       buf[u] = sc->H[u+8];

-    memcpy_m256i( dst256, buf, 8 );
+    memcpy_256( dst256, buf, 8 );
 }

 void
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -1,11 +1,12 @@
-#if defined(JHA_4WAY)
-
 #include "jha-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "avxdefs.h"
+//#include "avxdefs.h"
+
+#if defined(JHA_4WAY)
+
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
@@ -15,19 +16,19 @@
 //static __thread keccak512_4way_context jha_kec_mid
 //                                   __attribute__ ((aligned (64)));

-void jha_hash_4way( void *output, const void *input )
+void jha_hash_4way( void *out, const void *input )
 {
    uint64_t hash0[8] __attribute__ ((aligned (64)));
    uint64_t hash1[8] __attribute__ ((aligned (64)));
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-    uint64_t vhasha[8*4] __attribute__ ((aligned (64)));
-    uint64_t vhashb[8*4] __attribute__ ((aligned (64)));
-    __m256i mask;
-    __m256i* vh256 = (__m256i*)vhash;
-    __m256i* vha256 = (__m256i*)vhasha;
-    __m256i* vhb256 = (__m256i*)vhashb;
+    uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
+    __m256i mask0, mask1;
+    __m256i* vh = (__m256i*)vhash;
+    __m256i* vh0 = (__m256i*)vhash0;
+    __m256i* vh1 = (__m256i*)vhash1;

    blake512_4way_context  ctx_blake;
    hashState_groestl      ctx_groestl;
@@ -40,21 +41,29 @@ void jha_hash_4way( void *output, const void *input )
    keccak512_4way_close( &ctx_keccak, vhash );

 //    memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
-//    keccak512_4way( &ctx_keccak, input+64, 16 );
+//    keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
 //    keccak512_4way_close( &ctx_keccak, vhash );

    // Heavy & Light Pair Loop
    for ( int round = 0; round < 3; round++ )
    {
-       memset_zero_m256i( vha256, 20 );
-       memset_zero_m256i( vhb256, 20 );
+//       memset_zero_256( vh0, 20 );
+//       memset_zero_256( vh1, 20 );

-       mask = _mm256_sub_epi64( _mm256_and_si256( vh256[0],
-                        mm256_vec_epi64( 0x1 ) ), mm256_vec_epi64( 0x1 ) );
+      // positive logic, if maski select vhi
+      // assuming mm256_negate_64 computes 0 - x, a set test bit (1) becomes -1,
+      // i.e. mask0 is all ones and vh0 is taken. mask1 is the complement of
+      // mask0, so vh1 is taken when the test bit is clear.
+      mask0 = mm256_negate_64(
+                     _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
+      mask1 = mm256_not( mask0 );
+
+//       mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
+//                     _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );

       // groestl (serial) v skein

-       m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash0,
@@ -71,58 +80,66 @@ void jha_hash_4way( void *output, const void *input )
       update_and_final_groestl( &ctx_groestl, (char*)hash3,
                                          (char*)hash3, 512 );

-       m256_interleave_4x64( vhasha, hash0, hash1, hash2, hash3, 512 );
+       mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );

       // skein

       skein512_4way_init( &ctx_skein );
       skein512_4way( &ctx_skein, vhash, 64 );
-       skein512_4way_close( &ctx_skein, vhashb );
+       skein512_4way_close( &ctx_skein, vhash1 );

       // merge vectored hash
       for ( int i = 0; i < 8; i++ )
       {
+          vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
+                                   _mm256_and_si256( vh1[i], mask1 ) );
+/*
          vha256[i] = _mm256_maskload_epi64( 
-                                      vhasha + i*4, mm256_bitnot(mask ) );
+                                      vhasha + i*4, mm256_not( mask ) );
          vhb256[i] = _mm256_maskload_epi64(
                                      vhashb + i*4, mask );
          vh256[i]  = _mm256_or_si256( vha256[i], vhb256[i] );
+*/
       }

       // blake v jh

       blake512_4way_init( &ctx_blake );
       blake512_4way( &ctx_blake, vhash, 64 );
-       blake512_4way_close( &ctx_blake, vhasha );
+       blake512_4way_close( &ctx_blake, vhash0 );

       jh512_4way_init( &ctx_jh );
       jh512_4way( &ctx_jh, vhash, 64 );
-       jh512_4way_close( &ctx_jh, vhashb );
+       jh512_4way_close( &ctx_jh, vhash1 );

-       // merge vectored hash
+       // merge hash
       for ( int i = 0; i < 8; i++ )
       {
+          vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
+                                   _mm256_and_si256( vh1[i], mask1 ) );
+/*
          vha256[i] = _mm256_maskload_epi64(
-                                      vhasha + i*4, mm256_bitnot(mask ) );
+                                      vhasha + i*4, mm256_not( mask ) );
          vhb256[i] = _mm256_maskload_epi64(
                                      vhashb + i*4, mask );
          vh256[i]  = _mm256_or_si256( vha256[i], vhb256[i] );
+*/
       }
    }

-    m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );

-    memcpy( output,       hash0, 32 );
-    memcpy( output+32,    hash1, 32 );
-    memcpy( output+64,    hash2, 32 );
-    memcpy( output+96,    hash3, 32 );
+//    memcpy( output,       hash0, 32 );
+//    memcpy( output+32,    hash1, 32 );
+//    memcpy( output+64,    hash2, 32 );
+//    memcpy( output+96,    hash3, 32 );

 }

 int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done )
 {
-     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t hash[8*4] __attribute__ ((aligned (64)));
     uint32_t vdata[20*4] __attribute__ ((aligned (64)));
     uint32_t endiandata[20] __attribute__((aligned(64)));
 	uint32_t *pdata = work->data;
@@ -160,7 +177,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
      be32enc( &endiandata[i], pdata[i] );

   uint64_t *edata = (uint64_t*)endiandata;
-   m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   // precalc midstate for keccak
 //   keccak512_4way_init( &jha_kec_mid );
--- a/algo/jh/jha-gate.c
+++ b/algo/jh/jha-gate.c
@@ -3,15 +3,16 @@

 bool register_jha_algo( algo_gate_t* gate )
 {
-//#if defined (JHA_4WAY)
-//  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-//  gate->scanhash         = (void*)&scanhash_jha_4way;
-//  gate->hash             = (void*)&jha_hash_4way;
-//#else
-  gate->optimizations = SSE2_OPT | AES_OPT;
+#if defined (JHA_4WAY)
+  four_way_not_tested();
+  gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
+  gate->scanhash         = (void*)&scanhash_jha_4way;
+  gate->hash             = (void*)&jha_hash_4way;
+#else
+  gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
  gate->scanhash         = (void*)&scanhash_jha;
  gate->hash             = (void*)&jha_hash;
-//#endif
+#endif
  gate->set_target       = (void*)&scrypt_set_target;
  return true;
 };
--- a/algo/jh/jha-gate.h
+++ b/algo/jh/jha-gate.h
@@ -9,19 +9,17 @@
  #define JHA_4WAY
 #endif

-//#if defined JHA_4WAY
-//void jha_hash_4way( void *state, const void *input );
+#if defined JHA_4WAY
+void jha_hash_4way( void *state, const void *input );

-//int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
-//                       uint64_t *hashes_done );
-//#else
+int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done );
+#endif

 void jha_hash( void *state, const void *input );

 int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done );

-//#endif
-
 #endif

--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -1,39 +1,30 @@
 #include "keccak-gate.h"
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "sph_keccak.h"
-#include "keccak-hash-4way.h"

 #ifdef KECCAK_4WAY

+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "sph_keccak.h"
+#include "keccak-hash-4way.h"
+
 void keccakhash_4way(void *state, const void *input)
 {
-     uint64_t hash0[8] __attribute__ ((aligned (64)));
-     uint64_t hash1[8] __attribute__ ((aligned (64)));
-     uint64_t hash2[8] __attribute__ ((aligned (64)));
-     uint64_t hash3[8] __attribute__ ((aligned (64)));
-     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-     keccak256_4way_context ctx;
+    uint64_t vhash[4*4] __attribute__ ((aligned (64)));
+    keccak256_4way_context ctx;

-     keccak256_4way_init( &ctx );
-     keccak256_4way( &ctx, input, 80 );
-     keccak256_4way_close( &ctx, vhash );
+    keccak256_4way_init( &ctx );
+    keccak256_4way( &ctx, input, 80 );
+    keccak256_4way_close( &ctx, vhash );

-     m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
-
-     memcpy( state,    hash0, 32 );
-     memcpy( state+32, hash1, 32 );
-     memcpy( state+64, hash2, 32 );
-     memcpy( state+96, hash3, 32 );
+    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done)
 {
-   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
@@ -52,7 +43,7 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
      be32enc( &endiandata[i], pdata[i] );

   uint64_t *edata = (uint64_t*)endiandata;
-   m256_interleave_4x64x( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do {
      found[0] = found[1] = found[2] = found[3] = false;
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -9,19 +9,38 @@ int64_t keccak_get_max64() { return 0x7ffffLL; }

 bool register_keccak_algo( algo_gate_t* gate )
 {
+  gate->optimizations = FOUR_WAY_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  gate->set_target      = (void*)&keccak_set_target;
  gate->get_max64       = (void*)&keccak_get_max64;
 #if defined (KECCAK_4WAY)
-  gate->optimizations = SSE2_OPT | AVX2_OPT;
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
 #else
-  gate->optimizations = SSE2_OPT;
  gate->scanhash        = (void*)&scanhash_keccak;
  gate->hash            = (void*)&keccakhash;
 #endif
  return true;
 };

+void keccakc_set_target( struct work* work, double job_diff )
+{  // keccakc work target: job_diff scaled down by 256 * opt_diff_factor
+  work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool register_keccakc_algo( algo_gate_t* gate )
+{  // keccakc gate: same scanhash/hash/get_max64 as keccak, own set_target
+  gate->optimizations = FOUR_WAY_OPT;
+  gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;  // keccak registers SHA256_gen_merkle_root here
+  gate->set_target      = (void*)&keccakc_set_target;
+  gate->get_max64       = (void*)&keccak_get_max64;
+#if defined (KECCAK_4WAY)
+  gate->scanhash  = (void*)&scanhash_keccak_4way;   // 4-way build
+  gate->hash      = (void*)&keccakhash_4way;
+#else
+  gate->scanhash        = (void*)&scanhash_keccak;  // non-4-way build
+  gate->hash            = (void*)&keccakhash;
+#endif
+  return true;
+};

--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -1,5 +1,5 @@
-#ifndef __KECCAK_GATE_H__
-#define __KECCAK_GATE_H__
+#ifndef KECCAK_GATE_H__
+#define KECCAK_GATE_H__

 #include "algo-gate-api.h"
 #include <stdint.h>
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -54,10 +54,6 @@ static const sph_u64 RC[] = {
        kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
 } while (0)

-#define mm256_neg1 \
-        (_mm256_set_epi64x( 0xffffffffffffffff, 0xffffffffffffffff, \
-                            0xffffffffffffffff, 0xffffffffffffffff ) )
-
 #define DECL64(x)        __m256i x
 #define MOV64(d, s)      (d = s)
 #define XOR64(d, a, b)   (d = _mm256_xor_si256(a,b))
@@ -403,7 +399,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,

    if ( len < (lim - ptr) )
    {
-        memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
+        memcpy_256( buf + (ptr>>3), vdata, len>>3 );
        kc->ptr = ptr + len;
        return;
    }
@@ -416,7 +412,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
        clen = (lim - ptr);
        if ( clen > len )
             clen = len;
-        memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
+        memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
        ptr += clen;
        vdata = vdata + (clen>>3);
        len -= clen;
@@ -453,7 +449,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    {
        j = lim - kc->ptr;
        u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
-        memset_zero_m256i( u.tmp + 1, (j>>3) - 2 );
+        memset_zero_256( u.tmp + 1, (j>>3) - 2 );
        u.tmp[ (j>>3) - 1] = _mm256_set_epi64x( 0x8000000000000000,
                0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
    }
@@ -467,7 +463,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    NOT64( kc->w[20], kc->w[20] );
    for ( j = 0; j < m256_len; j++ )
         u.tmp[j] =  kc->w[j]; 
-    memcpy_m256i( dst, u.tmp, m256_len );
+    memcpy_256( dst, u.tmp, m256_len );
 }

 void keccak256_4way_init( void *kc )
--- a/algo/luffa/sse2/luffa_for_sse2.c
+++ b/algo/luffa/sse2/luffa_for_sse2.c
@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
-                      mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
+                      mm_byteswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    if ( state->rembytes  )
    {
      // remaining data bytes
-      casti_m128i( state->buffer, 0 ) = mm_byteswap_epi32( cast_m128i( data ) );
+      casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) );
      // padding of partial block
      casti_m128i( state->buffer, 1 ) =
            _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
-                      mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
+                      mm_byteswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    {
      // padding of partial block
      rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
-                      mm_byteswap_epi32( cast_m128i( data ) ) );
+                      mm_byteswap_32( cast_m128i( data ) ) );
    }
    else
    {
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

-    casti_m256i( b, 0 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );

    rnd512( state, zero, zero );

@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

-    casti_m256i( b, 1 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
 }

 #else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 0 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 1 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );

    rnd512( state, zero, zero );

@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 2 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 3 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
 }
 #endif

--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -377,7 +377,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
   if (wholeMatrix == NULL)
      return -1;
-
+/*
 #if defined (__AVX2__)
   memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
 #elif defined(__AVX__)
@@ -385,7 +385,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
 #else
   memset(wholeMatrix, 0, i);
 #endif
-
+*/
   uint64_t *ptrWord = wholeMatrix;

   //=== Getting the password + salt + basil padded with 10*1 ==========//
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -128,34 +128,10 @@ void lyra2re_set_target ( struct work* work, double job_diff )
   work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
 }

-/*
-bool lyra2re_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
-   lyra2re_wholeMatrix = _mm_malloc( i, 64 );
-
-   if ( lyra2re_wholeMatrix == NULL )
-     return false;
-
-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)lyra2re_wholeMatrix, i/32 );
-#elif defined(__AVX__)
-   memset_zero_m128i( (__m128i*)lyra2re_wholeMatrix, i/16 );
-#else
-   memset( lyra2re_wholeMatrix, 0, i );
-#endif
-   return true;
-}
-*/
-
 bool register_lyra2re_algo( algo_gate_t* gate )
 {
  init_lyra2re_ctx();
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-//  gate->miner_thread_init = (void*)&lyra2re_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2re;
  gate->hash       = (void*)&lyra2re_hash;
  gate->get_max64  = (void*)&lyra2re_get_max64;
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -132,23 +132,13 @@ bool lyra2rev2_thread_init()
   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   l2v2_wholeMatrix = _mm_malloc( i, 64 );

-   if ( l2v2_wholeMatrix == NULL )
-     return false;
-
-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)l2v2_wholeMatrix, i/32 );
-#elif defined (__AVX__)
-   memset_zero_m128i( (__m128i*)l2v2_wholeMatrix, i/16 );
-#else
-   memset( l2v2_wholeMatrix, 0, i );
-#endif
-   return true;
+   return l2v2_wholeMatrix;
 }

 bool register_lyra2rev2_algo( algo_gate_t* gate )
 {
  init_lyra2rev2_ctx();
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  gate->scanhash          = (void*)&scanhash_lyra2rev2;
  gate->hash              = (void*)&lyra2rev2_hash;
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -0,0 +1,168 @@
+#include "lyra2z-gate.h"
+
+#ifdef LYRA2Z_4WAY
+
+#include <memory.h>
+#include <mm_malloc.h>
+//#include "algo-gate-api.h"
+#include "lyra2.h"
+#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake-hash-4way.h"
+//#include "avxdefs.h"
+
+// Per-thread lyra2 matrix, sized like the non-4way one; lyra2 is done serially
+__thread uint64_t* lyra2z_4way_matrix;
+
+bool lyra2z_4way_thread_init()   // false if the 64-byte aligned allocation fails
+{
+ return ( lyra2z_4way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
+}
+
+static __thread blake256_4way_context l2z_4way_blake_mid;  // per-thread blake256 midstate
+
+void lyra2z_4way_midstate( const void* input )  // NOTE: not currently called or consumed in this file
+{
+       blake256_4way_init( &l2z_4way_blake_mid );
+       blake256_4way( &l2z_4way_blake_mid, input, 64 );   // absorb the first 64 bytes of input
+}
+
+// Lyra2z hash of four block headers held in one 4x32 interleaved buffer.
+// All four lanes are hashed together with blake256, then deinterleaved;
+// lyra2 is done serially on each lane's 32-byte digest, reusing the
+// per-thread matrix.
+//
+// state: receives 4 x 32 bytes of output, lane i at byte offset 32*i.
+// input: interleaved 80-byte headers, laid out as by mm_interleave_4x32.
+void lyra2z_4way_hash( void *state, const void *input )
+{
+     uint32_t hash0[8] __attribute__ ((aligned (64)));
+     uint32_t hash1[8] __attribute__ ((aligned (64)));
+     uint32_t hash2[8] __attribute__ ((aligned (64)));
+     uint32_t hash3[8] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
+
+     // The full 80 bytes are hashed on every call; the midstate kept by
+     // lyra2z_4way_midstate is not used here.
+     blake256_4way_init( &ctx_blake );
+     blake256_4way( &ctx_blake, input, 80 );
+     blake256_4way_close( &ctx_blake, vhash );
+
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+
+     // Every lane needs its lyra2z pass; skipping one would leave only the
+     // intermediate blake256 digest in that lane's output.
+     LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
+
+     memcpy( state,    hash0, 32 );
+     memcpy( state+32, hash1, 32 );
+     memcpy( state+64, hash2, 32 );
+     memcpy( state+96, hash3, 32 );
+}
+
+// Scan nonces four at a time, starting at pdata[19], until a lane meets the
+// target, the nonce range is used up or a work restart is requested.
+// Each hit is flagged in work->nfound[lane] with its nonce stored in
+// work->nonces[lane]. Returns the number of lanes that met the target and
+// sets *hashes_done to n - first_nonce + 1.
+int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 76; // 19*4
+   uint32_t *noncep1 = vdata + 77;
+   uint32_t *noncep2 = vdata + 78;
+   uint32_t *noncep3 = vdata + 79;
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+
+   // Convert all 20 words so edata[19] is defined before the 640-bit
+   // interleave; the nonce lanes of vdata are rewritten on every pass below.
+   for ( int i=0; i < 20; i++ )
+      be32enc( &edata[i], pdata[i] );
+
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep0, n   );
+      be32enc( noncep1, n+1 );
+      be32enc( noncep2, n+2 );
+      be32enc( noncep3, n+3 );
+
+      lyra2z_4way_hash( hash, vdata );
+
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      {
+          found[0] = true;
+          num_found++;
+          nonces[0] = pdata[19] = n;
+          work_set_target_ratio( work, hash );
+      }
+
+      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+      {
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+          work_set_target_ratio( work, hash+8 );
+      }
+
+      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
+      {
+          found[2] = true;
+          num_found++;
+          nonces[2] = n+2;
+          work_set_target_ratio( work, hash+16 );
+      }
+
+      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
+      {
+          found[3] = true;
+          num_found++;
+          nonces[3] = n+3;
+          work_set_target_ratio( work, hash+24 );
+      }
+      // One pass covers nonces n .. n+3, so step by the lane count.
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
+/*
+
+		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+			work_set_target_ratio(work, hash);
+			pdata[19] = nonce;
+			*hashes_done = pdata[19] - first_nonce;
+			return 1;
+		}
+		nonce++;
+
+	} while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+	pdata[19] = nonce;
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}
+*/
+
--- a/algo/lyra2/lyra2z-gate.c
+++ b/algo/lyra2/lyra2z-gate.c
@@ -0,0 +1,28 @@
+#include "lyra2z-gate.h"
+#include "lyra2.h"
+
+void lyra2z_set_target( struct work* work, double job_diff )
+{  // lyra2z work target: job_diff scaled down by 256 * opt_diff_factor
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool register_lyra2z_algo( algo_gate_t* gate )
+{  // fill in the lyra2z gate; the 4-way functions are used when LYRA2Z_4WAY is defined
+#ifdef LYRA2Z_4WAY
+  four_way_not_tested();
+  gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;  // allocates the per-thread matrix
+  gate->scanhash   = (void*)&scanhash_lyra2z_4way;
+  gate->hash       = (void*)&lyra2z_4way_hash;
+#else
+  gate->optimizations = AVX_OPT | AVX2_OPT;
+  gate->miner_thread_init = (void*)&lyra2z_thread_init;       // allocates the per-thread matrix
+  gate->scanhash   = (void*)&scanhash_lyra2z;
+  gate->hash       = (void*)&lyra2z_hash;
+#endif
+
+  gate->get_max64  = (void*)&get_max64_0xffffLL;   // set for both builds
+  gate->set_target = (void*)&lyra2z_set_target;
+  return true;
+};
+
--- a/algo/lyra2/lyra2z-gate.h
+++ b/algo/lyra2/lyra2z-gate.h
@@ -0,0 +1,33 @@
+#ifndef LYRA2Z_GATE_H__
+#define LYRA2Z_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY)
+  #define LYRA2Z_4WAY
+#endif
+
+
+#define LYRA2Z_MATRIX_SIZE  (BLOCK_LEN_INT64 * 8 * 8 * 8)  // bytes: BLOCK_LEN_INT64 words * 8 cols * 8 bytes/word * 8 rows
+
+#if defined(LYRA2Z_4WAY)
+
+void lyra2z_4way_hash( void *state, const void *input );
+
+int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+bool lyra2z_4way_thread_init();
+
+#endif
+
+void lyra2z_hash( void *state, const void *input );
+
+int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+bool lyra2z_thread_init();
+
+#endif
+
--- a/algo/lyra2/lyra2z.c
+++ b/algo/lyra2/lyra2z.c
@@ -1,40 +1,49 @@
 #include <memory.h>
 #include <mm_malloc.h>
-#include "algo-gate-api.h"
+#include "lyra2z-gate.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
 #include "avxdefs.h"

-__thread uint64_t* zcoin_wholeMatrix;
+__thread uint64_t* lyra2z_matrix;

-static __thread sph_blake256_context zcoin_blake_mid;
-
-
-void zcoin_midstate( const void* input )
+bool lyra2z_thread_init()
 {
-       sph_blake256_init( &zcoin_blake_mid );
-       sph_blake256( &zcoin_blake_mid, input, 64 );
+//   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
+//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+//   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
+   const int i = BLOCK_LEN_INT64 * 8 * 8 * 8;
+   lyra2z_matrix = _mm_malloc( i, 64 );
+   return lyra2z_matrix;
+}
+
+static __thread sph_blake256_context lyra2z_blake_mid;
+
+void lyra2z_midstate( const void* input )
+{
+       sph_blake256_init( &lyra2z_blake_mid );
+       sph_blake256( &lyra2z_blake_mid, input, 64 );
 }

 // block 2050 new algo, blake plus new lyra parms. new input
 // is power of 2 so normal lyra can be used
 //void zcoin_hash(void *state, const void *input, uint32_t height)
-void zcoin_hash(void *state, const void *input )
+void lyra2z_hash( void *state, const void *input )
 {
        uint32_t _ALIGN(64) hash[16];

        sph_blake256_context ctx_blake __attribute__ ((aligned (64)));

-        memcpy( &ctx_blake, &zcoin_blake_mid, sizeof zcoin_blake_mid );
+        memcpy( &ctx_blake, &lyra2z_blake_mid, sizeof lyra2z_blake_mid );
        sph_blake256( &ctx_blake, input + 64, 16 );
        sph_blake256_close( &ctx_blake, hash );

-        LYRA2Z( zcoin_wholeMatrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
+        LYRA2Z( lyra2z_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);

    memcpy(state, hash, 32);
 }

-int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done )
 {
 	uint32_t _ALIGN(64) hash[8];
@@ -52,11 +61,11 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}

-        zcoin_midstate( endiandata );
+        lyra2z_midstate( endiandata );

 	do {
 		be32enc(&endiandata[19], nonce);
-                zcoin_hash( hash, endiandata );
+                lyra2z_hash( hash, endiandata );

 		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
 			work_set_target_ratio(work, hash);
@@ -73,50 +82,41 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

+/*
 //int64_t get_max64_0xffffLL() { return 0xffffLL; };

-void zcoin_set_target( struct work* work, double job_diff )
+void lyra2z_set_target( struct work* work, double job_diff )
 {
 work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
 }
-/*
+
 bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
 {
   work->height = sctx->bloc_height;
   return false;
 }
-*/

-bool zcoin_thread_init()
+
+bool lyra2z_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
-   zcoin_wholeMatrix = _mm_malloc( i, 64 );
+   lyra2z_wholeMatrix = _mm_malloc( i, 64 );

-   if ( zcoin_wholeMatrix == NULL )
-     return false;
-
-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)zcoin_wholeMatrix, i/32 );
-#elif defined(__AVX__)
-   memset_zero_m128i( (__m128i*)zcoin_wholeMatrix, i/16 );
-#else
-   memset( zcoin_wholeMatrix, 0, i );
-#endif
-   return true;
+   return lyra2z_wholeMatrix;
 }

-bool register_zcoin_algo( algo_gate_t* gate )
+bool register_lyra2z_algo( algo_gate_t* gate )
 {
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  gate->miner_thread_init = (void*)&zcoin_thread_init;
-  gate->scanhash   = (void*)&scanhash_zcoin;
-  gate->hash       = (void*)&zcoin_hash;
+  gate->miner_thread_init = (void*)&lyra2z_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2z;
+  gate->hash       = (void*)&lyra2z_hash;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&zcoin_set_target;
+  gate->set_target = (void*)&lyra2z_set_target;
 //  gate->prevent_dupes = (void*)&zcoin_get_work_height;
  return true;
 };
-
+*/
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -64,22 +64,12 @@ bool lyra2z330_thread_init()
   int i = (int64_t)ROW_LEN_BYTES * 330; // nRows;
   lyra2z330_wholeMatrix = _mm_malloc( i, 64 );

-   if ( lyra2z330_wholeMatrix == NULL )
-     return false;
-
-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)lyra2z330_wholeMatrix, i/32 );
-#elif defined(__AVX__)
-   memset_zero_m128i( (__m128i*)lyra2z330_wholeMatrix, i/16 );
-#else
-   memset( lyra2z330_wholeMatrix, 0, i );
-#endif
-   return true;
+   return lyra2z330_wholeMatrix;
 }

 bool register_lyra2z330_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2z330_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z330;
  gate->hash       = (void*)&lyra2z330_hash;
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -130,12 +130,12 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
    //Squeezes full blocks
    for ( i = 0; i < fullBlocks; i++ )
    {
-       memcpy_m256i( out, state, BLOCK_LEN_M256I );
+       memcpy_256( out, state, BLOCK_LEN_M256I );
       LYRA_ROUND_AVX2( state[0], state[1], state[2], state[3] );
       out += BLOCK_LEN_M256I;
    }
    //Squeezes remaining bytes
-    memcpy_m256i( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
+    memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );

 #elif defined (__AVX__)

@@ -148,13 +148,13 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
    //Squeezes full blocks
    for ( i = 0; i < fullBlocks; i++ )
    {
-       memcpy_m128i( out, state, BLOCK_LEN_M128I );
+       memcpy_128( out, state, BLOCK_LEN_M128I );
       LYRA_ROUND_AVX( state[0], state[1], state[2], state[3],
                       state[4], state[5], state[6], state[7] );
       out += BLOCK_LEN_M128I;
    }
    //Squeezes remaining bytes
-    memcpy_m128i( out, state, ( len_m128i % BLOCK_LEN_M128I ) );
+    memcpy_128( out, state, ( len_m128i % BLOCK_LEN_M128I ) );

 #else

--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -66,11 +66,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   G_4X64( s0, s1, s2, s3 ); \
   s1 = mm256_rotl256_1x64( s1); \
-   s2 = mm256_swap128( s2 ); \
+   s2 = mm256_swap_128( s2 ); \
   s3 = mm256_rotr256_1x64( s3 ); \
   G_4X64( s0, s1, s2, s3 ); \
   s1 = mm256_rotr256_1x64( s1 ); \
-   s2 = mm256_swap128( s2 ); \
+   s2 = mm256_swap_128( s2 ); \
   s3 = mm256_rotl256_1x64( s3 );

 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
@@ -105,14 +105,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_rotl256_1x64( s2, s3 ); \
-   mm128_swap128( s4, s5 ); \
-   mm128_rotr256_1x64( s6, s7 ); \
+   mm_rotl256_1x64( s2, s3 ); \
+   mm_swap_128( s4, s5 ); \
+   mm_rotr256_1x64( s6, s7 ); \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_rotr256_1x64( s2, s3 ); \
-   mm128_swap128( s4, s5 ); \
-   mm128_rotl256_1x64( s6, s7 );
+   mm_rotr256_1x64( s2, s3 ); \
+   mm_swap_128( s4, s5 ); \
+   mm_rotl256_1x64( s6, s7 );

 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -15,7 +15,7 @@
 // no improvement with midstate
 //static __thread blake512_4way_context ctx_mid;

-void nist5hash_4way( void *output, const void *input )
+void nist5hash_4way( void *out, const void *input )
 {
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
@@ -35,7 +35,7 @@ void nist5hash_4way( void *output, const void *input )
     blake512_4way( &ctx_blake, input, 80 );
     blake512_4way_close( &ctx_blake, vhash );

-     m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     init_groestl( &ctx_groestl, 64 );
     update_and_final_groestl( &ctx_groestl, (char*)hash0,
@@ -50,7 +50,7 @@ void nist5hash_4way( void *output, const void *input )
     update_and_final_groestl( &ctx_groestl, (char*)hash3,
                               (const char*)hash3, 512 );

-     m256_interleave_4x64x( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

     jh512_4way_init( &ctx_jh );
     jh512_4way( &ctx_jh, vhash, 64 );
@@ -64,12 +64,7 @@ void nist5hash_4way( void *output, const void *input )
     skein512_4way( &ctx_skein, vhash, 64 );
     skein512_4way_close( &ctx_skein, vhash );

-     m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
-
-     memcpy( output,       hash0, 32 );
-     memcpy( output+32,    hash1, 32 );
-     memcpy( output+64,    hash2, 32 );
-     memcpy( output+96,    hash3, 32 );
+     mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
 }

 int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -109,7 +104,7 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
     swab32_array( endiandata, pdata, 20 );

     uint64_t *edata = (uint64_t*)endiandata;
-     m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

     // precalc midstate
 //     blake512_4way_init( &ctx_mid );
--- a/algo/nist5/nist5-gate.c
+++ b/algo/nist5/nist5-gate.c
@@ -2,12 +2,11 @@

 bool register_nist5_algo( algo_gate_t* gate )
 {
+    gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
 #if defined (NIST5_4WAY)
-    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
    gate->scanhash = (void*)&scanhash_nist5_4way;
    gate->hash     = (void*)&nist5hash_4way;
 #else
-    gate->optimizations = SSE2_OPT | AES_OPT;
    init_nist5_ctx();
    gate->scanhash = (void*)&scanhash_nist5;
    gate->hash     = (void*)&nist5hash;
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -20,7 +20,7 @@ void skeinhash_4way( void *state, const void *input )
     skein512_4way( &ctx_skein, input, 80 );
     skein512_4way_close( &ctx_skein, vhash );

-     m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     SHA256_Init( &ctx_sha256 );
     SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
@@ -38,21 +38,20 @@ void skeinhash_4way( void *state, const void *input )
     SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
     SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );

-     memcpy(  (char*)state,       (char*)hash0, 32 );
-     memcpy( ((char*)state) + 32, (char*)hash1, 32 );
-     memcpy( ((char*)state) + 64, (char*)hash2, 32 );
-     memcpy( ((char*)state) + 96, (char*)hash3, 32 );
+     memcpy( state,      hash0, 32 );
+     memcpy( state + 32, hash1, 32 );
+     memcpy( state + 64, hash2, 32 );
+     memcpy( state + 96, hash3, 32 );
 }

 int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done )
 {
-    uint32_t hash[4*8] __attribute__ ((aligned (64)));
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-    uint32_t endiandata[20] __attribute__ ((aligned (64)));
+    uint32_t hash[8*4] __attribute__ ((aligned (64)));
+    uint32_t edata[20] __attribute__ ((aligned (64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
-    uint64_t *edata = (uint64_t*)endiandata;
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
@@ -63,9 +62,9 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,

 // data is 80 bytes, 20 u32 or 4 u64.
 	
-    swab32_array( endiandata, pdata, 20 );
+    swab32_array( edata, pdata, 20 );
 
-    m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );

    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
    uint32_t *noncep1 = vdata + 75;
--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -6,8 +6,8 @@ int64_t skein_get_max64() { return 0x7ffffLL; }

 bool register_skein_algo( algo_gate_t* gate )
 {
+    gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT;
 #if defined (SKEIN_4WAY)
-    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash  = (void*)&scanhash_skein_4way;
    gate->hash      = (void*)&skeinhash_4way;
 #else
--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -463,7 +463,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,

   if ( len <= buf_size - ptr )
   {
-       memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
+       memcpy_256( buf + (ptr>>3), vdata, len>>3 );
       sc->ptr = ptr + len;
       return;
   }
@@ -483,7 +483,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
       clen = buf_size - ptr;
       if ( clen > len )
            clen = len;
-       memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
+       memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
       ptr += clen;
       vdata += (clen>>3);
       len -= clen;
@@ -520,11 +520,11 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,

 	READ_STATE_BIG(sc);

-        memset_zero_m256i( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+        memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
 	et = 352 + ((bcount == 0) << 7);
        UBI_BIG_4WAY( et, ptr );

-        memset_zero_m256i( buf, buf_size >> 3 );
+        memset_zero_256( buf, buf_size >> 3 );
        bcount = 0;
        UBI_BIG_4WAY( 510, 8 );

@@ -537,7 +537,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
        buf[6] = h6;
        buf[7] = h7;

-        memcpy_m256i( dst, buf, out_len >> 3 );
+        memcpy_256( dst, buf, out_len >> 3 );
 }

 static const sph_u64 IV256[] = {
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -19,13 +19,13 @@ void skein2hash_4way( void *output, const void *input )
   skein512_4way( &ctx, hash, 64 );
   skein512_4way_close( &ctx, hash );

-   m256_deinterleave_4x64( out64, out64+4, out64+8, out64+12, hash, 256 );
+   mm256_deinterleave_4x64( out64, out64+4, out64+8, out64+12, hash, 256 );
 }

 int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done )
 {
-    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint32_t hash[8*4] __attribute__ ((aligned (64)));
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__ ((aligned (64)));
    uint64_t *edata = (uint64_t*)endiandata;
@@ -41,7 +41,7 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,

    swab32_array( endiandata, pdata, 20 );

-    m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );

    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
    uint32_t *noncep1 = vdata + 75;
--- a/algo/skein/skein2-gate.c
+++ b/algo/skein/skein2-gate.c
@@ -9,12 +9,12 @@ int64_t skein2_get_max64 ()

 bool register_skein2_algo( algo_gate_t* gate )
 {
+  gate->optimizations = FOUR_WAY_OPT;
 #if defined (FOUR_WAY) && defined (__AVX2__)
-  gate->optimizations = SSE2_OPT | AVX2_OPT;
  gate->scanhash  = (void*)&scanhash_skein2_4way;
  gate->hash      = (void*)&skein2hash_4way;
+  four_way_not_tested();
 #else
-  gate->optimizations = SSE2_OPT;
  gate->scanhash  = (void*)&scanhash_skein2;
  gate->hash      = (void*)&skein2hash;
 #endif
--- a/algo/tribus/tribus-4way.c
+++ b/algo/tribus/tribus-4way.c
@@ -31,7 +31,7 @@ void tribus_hash_4way(void *state, const void *input)
     keccak512_4way( &ctx_keccak, vhash, 64 );
     keccak512_4way_close( &ctx_keccak, vhash );

-     m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     // hash echo serially
     init_echo( &ctx_echo, 512 );
@@ -92,7 +92,7 @@ int scanhash_tribus_4way(int thr_id, struct work *work, uint32_t max_nonce, uint
   }

   uint64_t *edata = (uint64_t*)endiandata;
-   m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   // precalc midstate
   // doing it one way then then interleaving would be faster but too
--- a/algo/tribus/tribus-gate.c
+++ b/algo/tribus/tribus-gate.c
@@ -14,15 +14,13 @@ bool tribus_thread_init()
 */
 bool register_tribus_algo( algo_gate_t* gate )
 {
-//  gate->miner_thread_init = (void*)&tribus_thread_init;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64     = (void*)&get_max64_0x1ffff;
 #if defined (TRIBUS_4WAY)
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->scanhash      = (void*)&scanhash_tribus_4way;
  gate->hash          = (void*)&tribus_hash_4way;
 #else
  gate->miner_thread_init = (void*)&tribus_thread_init;
-  gate->optimizations = SSE2_OPT | AES_OPT;
  gate->scanhash      = (void*)&scanhash_tribus;
  gate->hash          = (void*)&tribus_hash;
 #endif
--- a/algo/whirlpool/md-helper-4way.c
+++ b/algo/whirlpool/md-helper-4way.c
@@ -140,7 +140,7 @@ HASH ( void *cc, const void *data, size_t len )
      clen = SPH_BLEN - ptr;
      if ( clen > len )
         clen = len;
-      memcpy_m256i( sc->buf + (ptr>>3), vdata, clen>>3 );
+      memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
      vdata = vdata + (clen>>3);
      ptr += clen;
      len -= clen;
@@ -195,19 +195,19 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, 	unsigned ub, unsigned n,
    sc = cc;
    ptr = (unsigned)sc->count & (SPH_BLEN - 1U);

-uint64_t *b= (uint64_t*)sc->buf;
-uint64_t *s= (uint64_t*)sc->state;
+//uint64_t *b= (uint64_t*)sc->buf;
+//uint64_t *s= (uint64_t*)sc->state;
 //printf("Vptr 1= %u\n", ptr);
 //printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
 //printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );

 #ifdef PW01
-    sc->buf[ptr>>3] = mm256_vec_epi64( 0x100 >> 8 );
+    sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x100 >> 8 );
 //    sc->buf[ptr++] = 0x100 >> 8;
 #else
 // need to overwrite exactly one byte
 //    sc->buf[ptr>>3] = _mm256_set_epi64x( 0, 0, 0, 0x80 );
-    sc->buf[ptr>>3] = mm256_vec_epi64( 0x80 );
+    sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
 //    ptr++;
 #endif
    ptr += 8;
@@ -218,43 +218,43 @@ uint64_t *s= (uint64_t*)sc->state;

    if ( ptr > SPH_MAXPAD )
    {
-         memset_zero_m256i( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
+         memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
         RFUN( sc->buf, SPH_VAL );
-         memset_zero_m256i( sc->buf, SPH_MAXPAD >> 3 );
+         memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
    }
    else
    {
-         memset_zero_m256i( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
+         memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
    }
 #if defined BE64
 #if defined PLW1
    sc->buf[ SPH_MAXPAD>>3 ] =
-                 mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
+                 mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
 #elif defined PLW4
-    memset_zero_m256i( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
+    memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
    sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
-                mm256_byteswap_epi64( mm256_vec_epi64( sc->count >> 61 ) );
+                mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
    sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
-                mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
+                mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
 #else
    sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
-               mm256_byteswap_epi64( mm256_vec_epi64( sc->count >> 61 ) );
+               mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
    sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
-               mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
+               mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
 #endif  // PLW
 #else  // LE64
 #if defined PLW1
-    sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
+    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
 #elif defined PLW4
-    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_vec_epi64( sc->count << 3 );
+    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
-                       mm256_vec_epi64( c->count >> 61 );
-    memset_zero_m256i( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
-                      2 * SPH_WLEN );
+                       _mm256_set1_epi64x( sc->count >> 61 );  // bits shifted out of count << 3
+    memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
+                      ( 2 * SPH_WLEN ) >> 3 );  // length in buf elements (>>3), matching the BE64 PLW4 arm
 #else
-    sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
+    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
-                          mm256_vec_epi64( sc->count >> 61 );
+                          _mm256_set1_epi64x( sc->count >> 61 );
 #endif // PLW

 #endif // LE64
@@ -276,7 +276,7 @@ uint64_t *s= (uint64_t*)sc->state;
    for ( u = 0; u < rnum; u ++ )
    {
 #if defined BE64
-       ((__m256i*)dst)[u] = mm256_byteswap_epi64( sc->val[u] );
+       ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
 #else  // LE64
       ((__m256i*)dst)[u] = sc->val[u];
 #endif
--- a/algo/whirlpool/sph_whirlpool.c.bak
+++ b/algo/whirlpool/sph_whirlpool.c.bak
--- a/algo/whirlpool/sph_whirlpool.h.bak
+++ b/algo/whirlpool/sph_whirlpool.h.bak
@@ -1,209 +0,0 @@
-/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
-/**
- * WHIRLPOOL interface.
- *
- * WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
- * version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
- * (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
- * version, 2003, with a new diffusion matrix, also described as "plain
- * WHIRLPOOL"). All three variants are implemented here.
- *
- * The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
- * M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
- * NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
- *
- * The current WHIRLPOOL specification and a reference implementation
- * can be found on the WHIRLPOOL web page:
- * http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_whirlpool.h
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifndef SPH_WHIRLPOOL_H__
-#define SPH_WHIRLPOOL_H__
-
-#include <stddef.h>
-#include "algo/sha/sph_types.h"
-
-#if SPH_64
-
-/**
- * Output size (in bits) for WHIRLPOOL.
- */
-#define SPH_SIZE_whirlpool   512
-
-/**
- * Output size (in bits) for WHIRLPOOL-0.
- */
-#define SPH_SIZE_whirlpool0   512
-
-/**
- * Output size (in bits) for WHIRLPOOL-1.
- */
-#define SPH_SIZE_whirlpool1   512
-
-/**
- * This structure is a context for WHIRLPOOL computations: it contains the
- * intermediate values and some data from the last entered block. Once
- * a WHIRLPOOL computation has been performed, the context can be reused for
- * another computation.
- *
- * The contents of this structure are private. A running WHIRLPOOL computation
- * can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-	unsigned char buf[64];    /* first field, for alignment */
-	sph_u64 state[8];
-#if SPH_64
-	sph_u64 count;
-#else
-	sph_u32 count_high, count_low;
-#endif
-#endif
-} sph_whirlpool_context;
-
-/**
- * Initialize a WHIRLPOOL context. This process performs no memory allocation.
- *
- * @param cc   the WHIRLPOOL context (pointer to a
- *             <code>sph_whirlpool_context</code>)
- */
-void sph_whirlpool_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing). This function applies the
- * plain WHIRLPOOL algorithm.
- *
- * @param cc     the WHIRLPOOL context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_whirlpool(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current WHIRLPOOL computation and output the result into the
- * provided buffer. The destination buffer must be wide enough to
- * accomodate the result (64 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the WHIRLPOOL context
- * @param dst   the destination buffer
- */
-void sph_whirlpool_close(void *cc, void *dst);
-
-/**
- * WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL.
- */
-typedef sph_whirlpool_context sph_whirlpool0_context;
-
-#ifdef DOXYGEN_IGNORE
-/**
- * Initialize a WHIRLPOOL-0 context. This function is identical to
- * <code>sph_whirlpool_init()</code>.
- *
- * @param cc   the WHIRLPOOL context (pointer to a
- *             <code>sph_whirlpool0_context</code>)
- */
-void sph_whirlpool0_init(void *cc);
-#endif
-
-#ifndef DOXYGEN_IGNORE
-#define sph_whirlpool0_init   sph_whirlpool_init
-#endif
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing). This function applies the
- * WHIRLPOOL-0 algorithm.
- *
- * @param cc     the WHIRLPOOL context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_whirlpool0(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current WHIRLPOOL-0 computation and output the result into the
- * provided buffer. The destination buffer must be wide enough to
- * accomodate the result (64 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the WHIRLPOOL-0 context
- * @param dst   the destination buffer
- */
-void sph_whirlpool0_close(void *cc, void *dst);
-
-/**
- * WHIRLPOOL-1 uses the same structure than plain WHIRLPOOL.
- */
-typedef sph_whirlpool_context sph_whirlpool1_context;
-
-#ifdef DOXYGEN_IGNORE
-/**
- * Initialize a WHIRLPOOL-1 context. This function is identical to
- * <code>sph_whirlpool_init()</code>.
- *
- * @param cc   the WHIRLPOOL context (pointer to a
- *             <code>sph_whirlpool1_context</code>)
- */
-void sph_whirlpool1_init(void *cc);
-#endif
-
-#ifndef DOXYGEN_IGNORE
-#define sph_whirlpool1_init   sph_whirlpool_init
-#endif
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing). This function applies the
- * WHIRLPOOL-1 algorithm.
- *
- * @param cc     the WHIRLPOOL context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_whirlpool1(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current WHIRLPOOL-1 computation and output the result into the
- * provided buffer. The destination buffer must be wide enough to
- * accomodate the result (64 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the WHIRLPOOL-1 context
- * @param dst   the destination buffer
- */
-void sph_whirlpool1_close(void *cc, void *dst);
-
-#endif
-
-#endif
--- a/algo/whirlpool/whirlpool-4way.c
+++ b/algo/whirlpool/whirlpool-4way.c
@@ -41,7 +41,7 @@ void whirlpool_hash_4way( void *state, const void *input )
     whirlpool1_4way( &ctx, vhash, 64 );
     whirlpool1_4way_close( &ctx, vhash);

-     m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

     memcpy( state   , hash0, 32 );
     memcpy( state+32, hash1, 32 );
@@ -74,7 +74,7 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
      be32enc(&endiandata[i], pdata[i]);

   uint64_t *edata = (uint64_t*)endiandata;
-   m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   // midstate
   whirlpool1_4way_init( &whirl_mid );
--- a/algo/whirlpool/whirlpool-hash-4way.c
+++ b/algo/whirlpool/whirlpool-hash-4way.c
@@ -3346,7 +3346,7 @@ do { \
 #define ROUND0         MUL8(ROUND0_W)
 #define UPDATE_STATE   MUL8(UPDATE_STATE_W)
 #define BYTE(x, n) \
-   _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), mm256_vec_epi64( 0xFF ) )
+   _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )

 // A very complex, but structured, expression with a mix of scalar
 // and vector operations to retrieve specific 64 bit constants from
@@ -3359,7 +3359,7 @@ do { \
 // Pack the data in a vector and return it.
 #define t_row( inv, row ) \
   _mm256_and_si256( \
-        _mm256_srli_epi64( inv, row << 3 ), mm256_vec_epi64( 0xFF ) )
+        _mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) )

 // Extract vector element from "lane" of vector "in[row]" and use it to index
 // scalar array of constants "table" and return referenced 64 bit entry.
@@ -3454,7 +3454,7 @@ void
 whirlpool_4way_init(void *cc)
 {
 	whirlpool_4way_context *sc = cc;;
-        memset_zero_m256i( sc->state, 8 );
+        memset_zero_256( sc->state, 8 );
 	sc->count = 0;
 }

@@ -3470,7 +3470,7 @@ name ## _round( const void *src, __m256i *state ) \
   ROUND0; \
   for (r = 0; r < 10; r ++) { \
      DECL8(tmp); \
-      ROUND_KSCHED( type ## _T, h, tmp, mm256_vec_epi64( type ## _RC[r] ) ); \
+      ROUND_KSCHED( type ## _T, h, tmp, _mm256_set1_epi64x( type ## _RC[r] ) ); \
      TRANSFER( h, tmp ); \
      ROUND_WENC( type ## _T, n, h, tmp ); \
      TRANSFER( n, tmp ); \