Jay D Dee
2019-05-19 13:39:45 -04:00
parent bfd1c002f9
commit e1aead3c76
139 changed files with 10907 additions and 4218 deletions


@@ -15,7 +15,7 @@ void blakehash_4way(void *state, const void *input)
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -37,7 +37,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way( &blake_4w_ctx, vdata, 64 );
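The renamed 4x32 (de)interleave helpers carry the data layout for 4-way hashing. A minimal scalar sketch of the assumed semantics (not the SIMD implementation): lane k of each 128-bit group holds the next 32-bit word of source stream k, and the deinterleave is the exact inverse.

#include <stdint.h>
#include <stddef.h>

// Scalar reference for the assumed 4x32 interleave semantics.
// bit_len is the per-lane length in bits (640 for an 80-byte header,
// 256 for a 32-byte hash, matching the calls above).
static void interleave_4x32_ref( uint32_t *dst, const uint32_t *s0,
                                 const uint32_t *s1, const uint32_t *s2,
                                 const uint32_t *s3, size_t bit_len )
{
   for ( size_t i = 0; i < bit_len/32; i++ )
   {
      dst[ 4*i     ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}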


@@ -363,14 +363,14 @@ static const sph_u64 CB[16] = {
do { \
a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
_mm_set_epi32( c1, c1, c1, c1 ), m0 ), b ), a ); \
d = mm_ror_32( _mm_xor_si128( d, a ), 16 ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
c = _mm_add_epi32( c, d ); \
b = mm_ror_32( _mm_xor_si128( b, c ), 12 ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
_mm_set_epi32( c0, c0, c0, c0 ), m1 ), b ), a ); \
d = mm_ror_32( _mm_xor_si128( d, a ), 8 ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
c = _mm_add_epi32( c, d ); \
b = mm_ror_32( _mm_xor_si128( b, c ), 7 ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
} while (0)
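mm128_ror_32 in the G rounds above rotates every 32-bit lane at once. A plausible SSE2-only definition, stated as an assumption since the real helper may special-case the 8- and 16-bit rotates with SSSE3 byte shuffles:

#include <emmintrin.h>

// Rotate each 32-bit lane right by c bits (sketch; 0 < c < 32).
#define mm128_ror_32( v, c ) \
   _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )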
#if SPH_COMPACT_BLAKE_32
@@ -562,22 +562,22 @@ do { \
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_bswap_32( *(buf + 0) ); \
M[0x1] = mm_bswap_32( *(buf + 1) ); \
M[0x2] = mm_bswap_32( *(buf + 2) ); \
M[0x3] = mm_bswap_32( *(buf + 3) ); \
M[0x4] = mm_bswap_32( *(buf + 4) ); \
M[0x5] = mm_bswap_32( *(buf + 5) ); \
M[0x6] = mm_bswap_32( *(buf + 6) ); \
M[0x7] = mm_bswap_32( *(buf + 7) ); \
M[0x8] = mm_bswap_32( *(buf + 8) ); \
M[0x9] = mm_bswap_32( *(buf + 9) ); \
M[0xA] = mm_bswap_32( *(buf + 10) ); \
M[0xB] = mm_bswap_32( *(buf + 11) ); \
M[0xC] = mm_bswap_32( *(buf + 12) ); \
M[0xD] = mm_bswap_32( *(buf + 13) ); \
M[0xE] = mm_bswap_32( *(buf + 14) ); \
M[0xF] = mm_bswap_32( *(buf + 15) ); \
M[0x0] = mm128_bswap_32( *(buf + 0) ); \
M[0x1] = mm128_bswap_32( *(buf + 1) ); \
M[0x2] = mm128_bswap_32( *(buf + 2) ); \
M[0x3] = mm128_bswap_32( *(buf + 3) ); \
M[0x4] = mm128_bswap_32( *(buf + 4) ); \
M[0x5] = mm128_bswap_32( *(buf + 5) ); \
M[0x6] = mm128_bswap_32( *(buf + 6) ); \
M[0x7] = mm128_bswap_32( *(buf + 7) ); \
M[0x8] = mm128_bswap_32( *(buf + 8) ); \
M[0x9] = mm128_bswap_32( *(buf + 9) ); \
M[0xA] = mm128_bswap_32( *(buf + 10) ); \
M[0xB] = mm128_bswap_32( *(buf + 11) ); \
M[0xC] = mm128_bswap_32( *(buf + 12) ); \
M[0xD] = mm128_bswap_32( *(buf + 13) ); \
M[0xE] = mm128_bswap_32( *(buf + 14) ); \
M[0xF] = mm128_bswap_32( *(buf + 15) ); \
for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -624,22 +624,22 @@ do { \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm_bswap_32( * buf ); \
M1 = mm_bswap_32( *(buf+1) ); \
M2 = mm_bswap_32( *(buf+2) ); \
M3 = mm_bswap_32( *(buf+3) ); \
M4 = mm_bswap_32( *(buf+4) ); \
M5 = mm_bswap_32( *(buf+5) ); \
M6 = mm_bswap_32( *(buf+6) ); \
M7 = mm_bswap_32( *(buf+7) ); \
M8 = mm_bswap_32( *(buf+8) ); \
M9 = mm_bswap_32( *(buf+9) ); \
MA = mm_bswap_32( *(buf+10) ); \
MB = mm_bswap_32( *(buf+11) ); \
MC = mm_bswap_32( *(buf+12) ); \
MD = mm_bswap_32( *(buf+13) ); \
ME = mm_bswap_32( *(buf+14) ); \
MF = mm_bswap_32( *(buf+15) ); \
M0 = mm128_bswap_32( * buf ); \
M1 = mm128_bswap_32( *(buf+1) ); \
M2 = mm128_bswap_32( *(buf+2) ); \
M3 = mm128_bswap_32( *(buf+3) ); \
M4 = mm128_bswap_32( *(buf+4) ); \
M5 = mm128_bswap_32( *(buf+5) ); \
M6 = mm128_bswap_32( *(buf+6) ); \
M7 = mm128_bswap_32( *(buf+7) ); \
M8 = mm128_bswap_32( *(buf+8) ); \
M9 = mm128_bswap_32( *(buf+9) ); \
MA = mm128_bswap_32( *(buf+10) ); \
MB = mm128_bswap_32( *(buf+11) ); \
MC = mm128_bswap_32( *(buf+12) ); \
MD = mm128_bswap_32( *(buf+13) ); \
ME = mm128_bswap_32( *(buf+14) ); \
MF = mm128_bswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -1073,8 +1073,8 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if (out_size_w32 == 8)
buf[52>>2] = _mm_or_si128( buf[52>>2],
_mm_set1_epi32( 0x01000000UL ) );
*(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
*(buf+(56>>2)) = mm128_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
}
else
@@ -1086,13 +1086,13 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
memset_zero_128( buf, 56>>2 );
if (out_size_w32 == 8)
buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
*(buf+(56>>2)) = mm128_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_bswap_32( sc->H[k] );
out[k] = mm128_bswap_32( sc->H[k] );
}
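mm128_bswap_32, used above for the big-endian length words and the final state, could look like this under an SSSE3 assumption (plain SSE2 would need shift-and-mask steps instead):

#include <tmmintrin.h>

// Reverse the byte order within each 32-bit lane (sketch).
static inline __m128i mm128_bswap_32( __m128i v )
{
   return _mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15,  8, 9,10,11,
                                              4, 5, 6, 7,  0, 1, 2, 3 ) );
}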
#if defined (__AVX2__)


@@ -85,7 +85,8 @@ void blake2s_4way_hash( void *output, const void *input )
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
mm_deinterleave_4x32( output, output+32, output+64, output+96, vhash, 256 );
mm128_deinterleave_4x32( output, output+32, output+64, output+96,
vhash, 256 );
}
int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -104,7 +105,7 @@ int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *noncep = vdata + 76; // 19*4
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );


@@ -92,13 +92,13 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
#define G4W(r,i,a,b,c,d) \
do { \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
d = mm_ror_32( _mm_xor_si128( d, a ), 16 ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
c = _mm_add_epi32( c, d ); \
b = mm_ror_32( _mm_xor_si128( b, c ), 12 ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
d = mm_ror_32( _mm_xor_si128( d, a ), 8 ); \
d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
c = _mm_add_epi32( c, d ); \
b = mm_ror_32( _mm_xor_si128( b, c ), 7 ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
} while(0)
#define ROUND4W(r) \


@@ -17,7 +17,7 @@ void blakecoin_4way_hash(void *state, const void *input)
blake256r8_4way( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -37,7 +37,7 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
HTarget = 0x7f;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );


@@ -23,7 +23,7 @@ void decred_hash_4way( void *state, const void *input )
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -44,7 +44,7 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
memcpy( edata, pdata, 180 );
// Use the old way until the new way is updated for this size.
mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
mm128_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );


@@ -140,6 +140,7 @@ bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
return true;
}
int decred_get_work_data_size() { return DECRED_DATA_SIZE; }
bool register_decred_algo( algo_gate_t* gate )
{
@@ -154,7 +155,7 @@ bool register_decred_algo( algo_gate_t* gate )
gate->optimizations = AVX2_OPT;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->display_extra_data = (void*)&decred_decode_extradata;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
@@ -163,7 +164,7 @@ bool register_decred_algo( algo_gate_t* gate )
gate->nbits_index = DECRED_NBITS_INDEX;
gate->ntime_index = DECRED_NTIME_INDEX;
gate->nonce_index = DECRED_NONCE_INDEX;
gate->work_data_size = DECRED_DATA_SIZE;
gate->get_work_data_size = (void*)&decred_get_work_data_size;
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
allow_mininginfo = false;
have_gbt = false;
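The work_data_size change turns a constant field into a callback so that algos with variable data sizes (phi2, later in this commit) can decide at run time. A sketch of the calling side; the call through algo_gate is assumed usage, not shown in this diff:

// Gate consumers now ask for the size instead of reading a field:
int size = algo_gate.get_work_data_size();   // was: algo_gate.work_data_size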


@@ -268,7 +268,7 @@ bool register_decred_algo( algo_gate_t* gate )
gate->hash = (void*)&decred_hash;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->display_extra_data = (void*)&decred_decode_extradata;
gate->decode_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;


@@ -77,26 +77,26 @@ static const sph_u64 IV512[] = {
#define ss0(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 3) ), \
_mm_xor_si128( mm_rol_32( (x), 4), \
mm_rol_32( (x), 19) ) )
_mm_xor_si128( mm128_rol_32( (x), 4), \
mm128_rol_32( (x), 19) ) )
#define ss1(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm_rol_32( (x), 8), \
mm_rol_32( (x), 23) ) )
_mm_xor_si128( mm128_rol_32( (x), 8), \
mm128_rol_32( (x), 23) ) )
#define ss2(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 1) ), \
_mm_xor_si128( mm_rol_32( (x), 12), \
mm_rol_32( (x), 25) ) )
_mm_xor_si128( mm128_rol_32( (x), 12), \
mm128_rol_32( (x), 25) ) )
#define ss3(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm_rol_32( (x), 15), \
mm_rol_32( (x), 29) ) )
_mm_xor_si128( mm128_rol_32( (x), 15), \
mm128_rol_32( (x), 29) ) )
#define ss4(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
@@ -104,16 +104,16 @@ static const sph_u64 IV512[] = {
#define ss5(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
#define rs1(x) mm_rol_32( x, 3 )
#define rs2(x) mm_rol_32( x, 7 )
#define rs3(x) mm_rol_32( x, 13 )
#define rs4(x) mm_rol_32( x, 16 )
#define rs5(x) mm_rol_32( x, 19 )
#define rs6(x) mm_rol_32( x, 23 )
#define rs7(x) mm_rol_32( x, 27 )
#define rs1(x) mm128_rol_32( x, 3 )
#define rs2(x) mm128_rol_32( x, 7 )
#define rs3(x) mm128_rol_32( x, 13 )
#define rs4(x) mm128_rol_32( x, 16 )
#define rs5(x) mm128_rol_32( x, 19 )
#define rs6(x) mm128_rol_32( x, 23 )
#define rs7(x) mm128_rol_32( x, 27 )
#define rol_off_32( M, j, off ) \
mm_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_s( M, H, j ) \
@@ -526,42 +526,42 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
_mm_slli_epi32( qt[23], 2 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
mm_rol_32( dH[4], 9 ),
mm128_rol_32( dH[4], 9 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
_mm_xor_si128( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
mm_rol_32( dH[5], 10 ),
mm128_rol_32( dH[5], 10 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
_mm_xor_si128( qt[16], qt[ 9] ) ) );
dH[10] = _mm_add_epi32( _mm_add_epi32(
mm_rol_32( dH[6], 11 ),
mm128_rol_32( dH[6], 11 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
_mm_xor_si128( qt[17], qt[10] ) ) );
dH[11] = _mm_add_epi32( _mm_add_epi32(
mm_rol_32( dH[7], 12 ),
mm128_rol_32( dH[7], 12 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
_mm_xor_si128( qt[18], qt[11] ) ) );
dH[12] = _mm_add_epi32( _mm_add_epi32(
mm_rol_32( dH[0], 13 ),
mm128_rol_32( dH[0], 13 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
_mm_xor_si128( qt[19], qt[12] ) ) );
dH[13] = _mm_add_epi32( _mm_add_epi32(
mm_rol_32( dH[1], 14 ),
mm128_rol_32( dH[1], 14 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
_mm_xor_si128( qt[20], qt[13] ) ) );
dH[14] = _mm_add_epi32( _mm_add_epi32(
mm_rol_32( dH[2], 15 ),
mm128_rol_32( dH[2], 15 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
_mm_xor_si128( qt[21], qt[14] ) ) );
dH[15] = _mm_add_epi32( _mm_add_epi32(
mm_rol_32( dH[3], 16 ),
mm128_rol_32( dH[3], 16 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
_mm_xor_si128( qt[22], qt[15] ) ) );


@@ -92,7 +92,6 @@ int cube_2way_reinit( cube_2way_context *sp )
{
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
return 0;
}
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
@@ -123,7 +122,7 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
{
const int len = size / 16;
const int len = size >> 4;
const __m256i *in = (__m256i*)data;
int i;
@@ -140,7 +139,6 @@ int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
sp->pos = 0;
}
}
return 0;
}
@@ -151,25 +149,22 @@ int cube_2way_close( cube_2way_context *sp, void *output )
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
_mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) );
transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
1,0,0,0 ) );
for ( i = 0; i < 10; ++i )
transform_2way( &cube_2way_ctx_cache );
sp->h[7] = _mm256_xor_si256( sp->h[7],
_mm256_set_epi32( 1,0,0,0, 1,0,0,0 ) );
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->h[i];
for ( i = 0; i < 10; ++i ) transform_2way( sp );
for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->h[i];
return 0;
}
int cube_2way_update_close( cube_2way_context *sp, void *output,
const void *data, size_t size )
{
const int len = size / 16;
const int len = size >> 4;
const __m256i *in = (__m256i*)data;
__m256i *hash = (__m256i*)output;
int i;
@@ -187,18 +182,15 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
_mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) );
transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
1,0,0,0 ) );
for ( i = 0; i < 10; ++i )
transform_2way( &cube_2way_ctx_cache );
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->h[i];
for ( i = 0; i < 10; ++i ) transform_2way( sp );
for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->h[i];
return 0;
}
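Besides the cleanup, both close paths above replace transform_2way( &cube_2way_ctx_cache ) with transform_2way( sp ) in the 10-round finalization, apparently fixing a bug where the shared init cache was transformed instead of the live context. The finalization order the new code implements, in outline:

// 1. XOR the 0x80 padding byte into state word h[pos], transform once.
// 2. XOR the finalization bit into the top 32 bits of h[7].
// 3. Run ten transforms on sp itself (the old code ran them on
//    cube_2way_ctx_cache, whose result was never read).
// 4. Copy h[0..hashlen-1] out as the digest.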


@@ -10,12 +10,12 @@
struct _cube_2way_context
{
__m256i h[8];
int hashlen; // __m128i
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
__m256i h[8] __attribute__ ((aligned (64)));
};
} __attribute__ ((aligned (64)));
typedef struct _cube_2way_context cube_2way_context;


@@ -254,6 +254,7 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
transform( sp );
transform( sp );
transform( sp );


@@ -60,336 +60,174 @@ MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
//#include "crypto_hash.h"
int crypto_hash(
unsigned char *out,
const unsigned char *in,
unsigned long long inlen
)
{
if(hash_echo(512, in, inlen * 8, out) == SUCCESS)
return 0;
return -1;
}
/*
int main()
{
return 0;
}
*/
#if 0
void DumpState(__m128i *ps)
{
int i, j, k;
unsigned int ucol;
for(j = 0; j < 4; j++)
{
for(i = 0; i < 4; i++)
{
printf("row %d,col %d : ", i, j);
for(k = 0; k < 4; k++)
{
ucol = *((int*)ps + 16 * i + 4 * j + k);
printf("%02x%02x%02x%02x ", (ucol >> 0) & 0xff, (ucol >> 8) & 0xff, (ucol >> 16) & 0xff, (ucol >> 24) & 0xff);
}
printf("\n");
}
}
printf("\n");
}
#endif
#ifndef NO_AES_NI
#define ECHO_SUBBYTES(state, i, j) \
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
k1 = _mm_add_epi32(k1, M128(const1))
#else
#define ECHO_SUBBYTES(state, i, j) \
AES_ROUND_VPERM(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
state[i][j] = _mm_xor_si128(state[i][j], k1);\
AES_ROUND_VPERM(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
k1 = _mm_add_epi32(k1, M128(const1))
#define ECHO_SUB_AND_MIX(state, i, j, state2, c, r1, r2, r3, r4) \
AES_ROUND_VPERM_CORE(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
ktemp = k1;\
TRANSFORM(ktemp, _k_ipt, t1, t4);\
state[i][j] = _mm_xor_si128(state[i][j], ktemp);\
AES_ROUND_VPERM_CORE(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
k1 = _mm_add_epi32(k1, M128(const1));\
s1 = state[i][j];\
s2 = s1;\
TRANSFORM(s2, mul2ipt, t1, t2);\
s3 = _mm_xor_si128(s1, s2);\
state2[r1][c] = _mm_xor_si128(state2[r1][c], s2);\
state2[r2][c] = _mm_xor_si128(state2[r2][c], s1);\
state2[r3][c] = _mm_xor_si128(state2[r3][c], s1);\
state2[r4][c] = _mm_xor_si128(state2[r4][c], s3)
#endif
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
k1 = _mm_add_epi32(k1, M128(const1))
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
t1 = _mm_srli_epi16(state1[0][j], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = s2;\
state2[1][j] = state1[0][j];\
state2[2][j] = state1[0][j];\
state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
t1 = _mm_srli_epi16(state1[0][j], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = s2;\
state2[1][j] = state1[0][j];\
state2[2][j] = state1[0][j];\
state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES(_state, 0, 0);\
ECHO_SUBBYTES(_state, 1, 0);\
ECHO_SUBBYTES(_state, 2, 0);\
ECHO_SUBBYTES(_state, 3, 0);\
ECHO_SUBBYTES(_state, 0, 1);\
ECHO_SUBBYTES(_state, 1, 1);\
ECHO_SUBBYTES(_state, 2, 1);\
ECHO_SUBBYTES(_state, 3, 1);\
ECHO_SUBBYTES(_state, 0, 2);\
ECHO_SUBBYTES(_state, 1, 2);\
ECHO_SUBBYTES(_state, 2, 2);\
ECHO_SUBBYTES(_state, 3, 2);\
ECHO_SUBBYTES(_state, 0, 3);\
ECHO_SUBBYTES(_state, 1, 3);\
ECHO_SUBBYTES(_state, 2, 3);\
ECHO_SUBBYTES(_state, 3, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES(_state2, 0, 0);\
ECHO_SUBBYTES(_state2, 1, 0);\
ECHO_SUBBYTES(_state2, 2, 0);\
ECHO_SUBBYTES(_state2, 3, 0);\
ECHO_SUBBYTES(_state2, 0, 1);\
ECHO_SUBBYTES(_state2, 1, 1);\
ECHO_SUBBYTES(_state2, 2, 1);\
ECHO_SUBBYTES(_state2, 3, 1);\
ECHO_SUBBYTES(_state2, 0, 2);\
ECHO_SUBBYTES(_state2, 1, 2);\
ECHO_SUBBYTES(_state2, 2, 2);\
ECHO_SUBBYTES(_state2, 3, 2);\
ECHO_SUBBYTES(_state2, 0, 3);\
ECHO_SUBBYTES(_state2, 1, 3);\
ECHO_SUBBYTES(_state2, 2, 3);\
ECHO_SUBBYTES(_state2, 3, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
ECHO_SUBBYTES(_state, 0, 0);\
ECHO_SUBBYTES(_state, 1, 0);\
ECHO_SUBBYTES(_state, 2, 0);\
ECHO_SUBBYTES(_state, 3, 0);\
ECHO_SUBBYTES(_state, 0, 1);\
ECHO_SUBBYTES(_state, 1, 1);\
ECHO_SUBBYTES(_state, 2, 1);\
ECHO_SUBBYTES(_state, 3, 1);\
ECHO_SUBBYTES(_state, 0, 2);\
ECHO_SUBBYTES(_state, 1, 2);\
ECHO_SUBBYTES(_state, 2, 2);\
ECHO_SUBBYTES(_state, 3, 2);\
ECHO_SUBBYTES(_state, 0, 3);\
ECHO_SUBBYTES(_state, 1, 3);\
ECHO_SUBBYTES(_state, 2, 3);\
ECHO_SUBBYTES(_state, 3, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES(_state2, 0, 0);\
ECHO_SUBBYTES(_state2, 1, 0);\
ECHO_SUBBYTES(_state2, 2, 0);\
ECHO_SUBBYTES(_state2, 3, 0);\
ECHO_SUBBYTES(_state2, 0, 1);\
ECHO_SUBBYTES(_state2, 1, 1);\
ECHO_SUBBYTES(_state2, 2, 1);\
ECHO_SUBBYTES(_state2, 3, 1);\
ECHO_SUBBYTES(_state2, 0, 2);\
ECHO_SUBBYTES(_state2, 1, 2);\
ECHO_SUBBYTES(_state2, 2, 2);\
ECHO_SUBBYTES(_state2, 3, 2);\
ECHO_SUBBYTES(_state2, 0, 3);\
ECHO_SUBBYTES(_state2, 1, 3);\
ECHO_SUBBYTES(_state2, 2, 3);\
ECHO_SUBBYTES(_state2, 3, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
#define SAVESTATE(dst, src)\
dst[0][0] = src[0][0];\
dst[0][1] = src[0][1];\
dst[0][2] = src[0][2];\
dst[0][3] = src[0][3];\
dst[1][0] = src[1][0];\
dst[1][1] = src[1][1];\
dst[1][2] = src[1][2];\
dst[1][3] = src[1][3];\
dst[2][0] = src[2][0];\
dst[2][1] = src[2][1];\
dst[2][2] = src[2][2];\
dst[2][3] = src[2][3];\
dst[3][0] = src[3][0];\
dst[3][1] = src[3][1];\
dst[3][2] = src[3][2];\
dst[3][3] = src[3][3]
dst[0][0] = src[0][0];\
dst[0][1] = src[0][1];\
dst[0][2] = src[0][2];\
dst[0][3] = src[0][3];\
dst[1][0] = src[1][0];\
dst[1][1] = src[1][1];\
dst[1][2] = src[1][2];\
dst[1][3] = src[1][3];\
dst[2][0] = src[2][0];\
dst[2][1] = src[2][1];\
dst[2][2] = src[2][2];\
dst[2][3] = src[2][3];\
dst[3][0] = src[3][0];\
dst[3][1] = src[3][1];\
dst[3][2] = src[3][2];\
dst[3][3] = src[3][3]
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
{
unsigned int r, b, i, j;
// __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
__m128i t1, t2, s2, k1;
__m128i _state[4][4], _state2[4][4], _statebackup[4][4];
unsigned int r, b, i, j;
__m128i t1, t2, s2, k1;
__m128i _state[4][4], _state2[4][4], _statebackup[4][4];
for(i = 0; i < 4; i++)
for(j = 0; j < ctx->uHashSize / 256; j++)
_state[i][j] = ctx->state[i][j];
for(i = 0; i < 4; i++)
for(j = 0; j < ctx->uHashSize / 256; j++)
_state[i][j] = ctx->state[i][j];
for(b = 0; b < uBlockCount; b++)
{
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
#ifdef NO_AES_NI
// transform cv
for(i = 0; i < 4; i++)
for(j = 0; j < ctx->uHashSize / 256; j++)
{
TRANSFORM(_state[i][j], _k_ipt, t1, t2);
}
#endif
for(b = 0; b < uBlockCount; b++)
// load message
for(j = ctx->uHashSize / 256; j < 4; j++)
{
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
// load message
for(j = ctx->uHashSize / 256; j < 4; j++)
{
for(i = 0; i < 4; i++)
{
_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
#ifdef NO_AES_NI
// transform message
TRANSFORM(_state[i][j], _k_ipt, t1, t2);
#endif
}
}
// save state
SAVESTATE(_statebackup, _state);
k1 = ctx->k;
#ifndef NO_AES_NI
for(r = 0; r < ctx->uRounds / 2; r++)
{
ECHO_ROUND_UNROLL2;
}
#else
for(r = 0; r < ctx->uRounds / 2; r++)
{
_state2[0][0] = M128(zero); _state2[1][0] = M128(zero); _state2[2][0] = M128(zero); _state2[3][0] = M128(zero);
_state2[0][1] = M128(zero); _state2[1][1] = M128(zero); _state2[2][1] = M128(zero); _state2[3][1] = M128(zero);
_state2[0][2] = M128(zero); _state2[1][2] = M128(zero); _state2[2][2] = M128(zero); _state2[3][2] = M128(zero);
_state2[0][3] = M128(zero); _state2[1][3] = M128(zero); _state2[2][3] = M128(zero); _state2[3][3] = M128(zero);
ECHO_SUB_AND_MIX(_state, 0, 0, _state2, 0, 0, 1, 2, 3);
ECHO_SUB_AND_MIX(_state, 1, 0, _state2, 3, 1, 2, 3, 0);
ECHO_SUB_AND_MIX(_state, 2, 0, _state2, 2, 2, 3, 0, 1);
ECHO_SUB_AND_MIX(_state, 3, 0, _state2, 1, 3, 0, 1, 2);
ECHO_SUB_AND_MIX(_state, 0, 1, _state2, 1, 0, 1, 2, 3);
ECHO_SUB_AND_MIX(_state, 1, 1, _state2, 0, 1, 2, 3, 0);
ECHO_SUB_AND_MIX(_state, 2, 1, _state2, 3, 2, 3, 0, 1);
ECHO_SUB_AND_MIX(_state, 3, 1, _state2, 2, 3, 0, 1, 2);
ECHO_SUB_AND_MIX(_state, 0, 2, _state2, 2, 0, 1, 2, 3);
ECHO_SUB_AND_MIX(_state, 1, 2, _state2, 1, 1, 2, 3, 0);
ECHO_SUB_AND_MIX(_state, 2, 2, _state2, 0, 2, 3, 0, 1);
ECHO_SUB_AND_MIX(_state, 3, 2, _state2, 3, 3, 0, 1, 2);
ECHO_SUB_AND_MIX(_state, 0, 3, _state2, 3, 0, 1, 2, 3);
ECHO_SUB_AND_MIX(_state, 1, 3, _state2, 2, 1, 2, 3, 0);
ECHO_SUB_AND_MIX(_state, 2, 3, _state2, 1, 2, 3, 0, 1);
ECHO_SUB_AND_MIX(_state, 3, 3, _state2, 0, 3, 0, 1, 2);
_state[0][0] = M128(zero); _state[1][0] = M128(zero); _state[2][0] = M128(zero); _state[3][0] = M128(zero);
_state[0][1] = M128(zero); _state[1][1] = M128(zero); _state[2][1] = M128(zero); _state[3][1] = M128(zero);
_state[0][2] = M128(zero); _state[1][2] = M128(zero); _state[2][2] = M128(zero); _state[3][2] = M128(zero);
_state[0][3] = M128(zero); _state[1][3] = M128(zero); _state[2][3] = M128(zero); _state[3][3] = M128(zero);
ECHO_SUB_AND_MIX(_state2, 0, 0, _state, 0, 0, 1, 2, 3);
ECHO_SUB_AND_MIX(_state2, 1, 0, _state, 3, 1, 2, 3, 0);
ECHO_SUB_AND_MIX(_state2, 2, 0, _state, 2, 2, 3, 0, 1);
ECHO_SUB_AND_MIX(_state2, 3, 0, _state, 1, 3, 0, 1, 2);
ECHO_SUB_AND_MIX(_state2, 0, 1, _state, 1, 0, 1, 2, 3);
ECHO_SUB_AND_MIX(_state2, 1, 1, _state, 0, 1, 2, 3, 0);
ECHO_SUB_AND_MIX(_state2, 2, 1, _state, 3, 2, 3, 0, 1);
ECHO_SUB_AND_MIX(_state2, 3, 1, _state, 2, 3, 0, 1, 2);
ECHO_SUB_AND_MIX(_state2, 0, 2, _state, 2, 0, 1, 2, 3);
ECHO_SUB_AND_MIX(_state2, 1, 2, _state, 1, 1, 2, 3, 0);
ECHO_SUB_AND_MIX(_state2, 2, 2, _state, 0, 2, 3, 0, 1);
ECHO_SUB_AND_MIX(_state2, 3, 2, _state, 3, 3, 0, 1, 2);
ECHO_SUB_AND_MIX(_state2, 0, 3, _state, 3, 0, 1, 2, 3);
ECHO_SUB_AND_MIX(_state2, 1, 3, _state, 2, 1, 2, 3, 0);
ECHO_SUB_AND_MIX(_state2, 2, 3, _state, 1, 2, 3, 0, 1);
ECHO_SUB_AND_MIX(_state2, 3, 3, _state, 0, 3, 0, 1, 2);
}
#endif
if(ctx->uHashSize == 256)
{
for(i = 0; i < 4; i++)
{
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
}
}
else
{
for(i = 0; i < 4; i++)
{
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
}
}
pmsg += ctx->uBlockLength;
for(i = 0; i < 4; i++)
{
_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
}
}
#ifdef NO_AES_NI
// transform state
for(i = 0; i < 4; i++)
for(j = 0; j < 4; j++)
{
TRANSFORM(_state[i][j], _k_opt, t1, t2);
}
#endif
// save state
SAVESTATE(_statebackup, _state);
SAVESTATE(ctx->state, _state);
k1 = ctx->k;
for(r = 0; r < ctx->uRounds / 2; r++)
{
ECHO_ROUND_UNROLL2;
}
if(ctx->uHashSize == 256)
{
for(i = 0; i < 4; i++)
{
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
}
}
else
{
for(i = 0; i < 4; i++)
{
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
}
}
pmsg += ctx->uBlockLength;
}
SAVESTATE(ctx->state, _state);
}
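For reference, ECHO's SubWords step is two full AES rounds per 128-bit word; with AES-NI that is the pair of _mm_aesenc_si128 calls in ECHO_SUBBYTES above, the first keyed with the running counter k1 and the second with an all-zero key, followed by the counter increment. In isolation (w is one state word):

w  = _mm_aesenc_si128( w, k1 );                  // SubBytes, ShiftRows,
                                                 // MixColumns, XOR k1
w  = _mm_aesenc_si128( w, _mm_setzero_si128() ); // second round, zero key
k1 = _mm_add_epi32( k1, M128( const1 ) );        // advance key counter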


@@ -30,6 +30,7 @@
typedef struct
{
__m128i state[4][4];
BitSequence buffer[192];
__m128i k;
__m128i hashsize;
__m128i const1536;
@@ -39,9 +40,8 @@ typedef struct
unsigned int uBlockLength;
unsigned int uBufferBytes;
DataLength processed_bits;
BitSequence buffer[192];
} hashState_echo;
} hashState_echo __attribute__ ((aligned (64)));
HashReturn init_echo(hashState_echo *state, int hashbitlen);

File diff suppressed because it is too large.


@@ -1,320 +0,0 @@
/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */
/**
* ECHO interface. ECHO is a family of functions which differ by
* their output size; this implementation defines ECHO for output
* sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_echo.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_ECHO_H__
#define SPH_ECHO_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
/**
* Output size (in bits) for ECHO-224.
*/
#define SPH_SIZE_echo224 224
/**
* Output size (in bits) for ECHO-256.
*/
#define SPH_SIZE_echo256 256
/**
* Output size (in bits) for ECHO-384.
*/
#define SPH_SIZE_echo384 384
/**
* Output size (in bits) for ECHO-512.
*/
#define SPH_SIZE_echo512 512
/**
* This structure is a context for ECHO computations: it contains the
* intermediate values and some data from the last entered block. Once
* an ECHO computation has been performed, the context can be reused for
* another computation. This specific structure is used for ECHO-224
* and ECHO-256.
*
* The contents of this structure are private. A running ECHO computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[192]; /* first field, for alignment */
size_t ptr;
union {
sph_u32 Vs[4][4];
#if SPH_64
sph_u64 Vb[4][2];
#endif
} u;
sph_u32 C0, C1, C2, C3;
#endif
} sph_echo_small_context;
/**
* This structure is a context for ECHO computations: it contains the
* intermediate values and some data from the last entered block. Once
* an ECHO computation has been performed, the context can be reused for
* another computation. This specific structure is used for ECHO-384
* and ECHO-512.
*
* The contents of this structure are private. A running ECHO computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
union {
sph_u32 Vs[8][4];
#if SPH_64
sph_u64 Vb[8][2];
#endif
} u;
sph_u32 C0, C1, C2, C3;
#endif
} sph_echo_big_context;
/**
* Type for an ECHO-224 context (identical to the common "small" context).
*/
typedef sph_echo_small_context sph_echo224_context;
/**
* Type for an ECHO-256 context (identical to the common "small" context).
*/
typedef sph_echo_small_context sph_echo256_context;
/**
* Type for an ECHO-384 context (identical to the common "big" context).
*/
typedef sph_echo_big_context sph_echo384_context;
/**
* Type for an ECHO-512 context (identical to the common "big" context).
*/
typedef sph_echo_big_context sph_echo512_context;
/**
* Initialize an ECHO-224 context. This process performs no memory allocation.
*
* @param cc the ECHO-224 context (pointer to a
* <code>sph_echo224_context</code>)
*/
void sph_echo224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the ECHO-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_echo224(void *cc, const void *data, size_t len);
/**
* Terminate the current ECHO-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the ECHO-224 context
* @param dst the destination buffer
*/
void sph_echo224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (28 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the ECHO-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_echo224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize an ECHO-256 context. This process performs no memory allocation.
*
* @param cc the ECHO-256 context (pointer to a
* <code>sph_echo256_context</code>)
*/
void sph_echo256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the ECHO-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_echo256(void *cc, const void *data, size_t len);
/**
* Terminate the current ECHO-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the ECHO-256 context
* @param dst the destination buffer
*/
void sph_echo256_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (32 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the ECHO-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_echo256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize an ECHO-384 context. This process performs no memory allocation.
*
* @param cc the ECHO-384 context (pointer to a
* <code>sph_echo384_context</code>)
*/
void sph_echo384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the ECHO-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_echo384(void *cc, const void *data, size_t len);
/**
* Terminate the current ECHO-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the ECHO-384 context
* @param dst the destination buffer
*/
void sph_echo384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (48 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the ECHO-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_echo384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize an ECHO-512 context. This process performs no memory allocation.
*
* @param cc the ECHO-512 context (pointer to a
* <code>sph_echo512_context</code>)
*/
void sph_echo512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the ECHO-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_echo512(void *cc, const void *data, size_t len);
/**
* Terminate the current ECHO-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the ECHO-512 context
* @param dst the destination buffer
*/
void sph_echo512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the ECHO-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_echo512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __cplusplus
}
#endif
#endif


@@ -33,7 +33,7 @@ void myriad_4way_hash( void *output, const void *input )
myrgr_4way_ctx_holder ctx;
memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
@@ -43,12 +43,12 @@ void myriad_4way_hash( void *output, const void *input )
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, vhash );
mm_deinterleave_4x32( output, output+32, output+64, output+96,
mm128_deinterleave_4x32( output, output+32, output+64, output+96,
vhash, 256 );
}
@@ -79,7 +79,7 @@ int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
do {
be32enc( noncep, n );


@@ -83,7 +83,7 @@ extern "C"{
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_or_si128( x4, x6 ) ), x5 ) ), \
_mm_and_si128( x4, \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm_not(x2), x5 ), \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm128_not(x2), x5 ), \
_mm_xor_si128( x1, x6 ) ), x0 ) ) ), \
_mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) )
@@ -91,7 +91,7 @@ extern "C"{
#define F5(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x0, \
mm_not( _mm_xor_si128( \
mm128_not( _mm_xor_si128( \
_mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), \
@@ -136,8 +136,8 @@ extern "C"{
#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm_ror_32( t, 7 ), \
mm_ror_32( x7, 11 ) ), \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), \
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
} while (0)


@@ -10,14 +10,10 @@
void keccakhash_4way(void *state, const void *input)
{
uint64_t vhash[4*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way( &ctx, input, 80 );
keccak256_4way_close( &ctx, vhash );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
keccak256_4way_close( &ctx, state );
}
int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -25,6 +21,8 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
@@ -49,13 +47,16 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
keccakhash_4way( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
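The reworked loop screens hash7[ lane<<1 ], the high 32 bits of each lane's word 7 in the interleaved 4x64 layout, and only deinterleaves a lane that passes, avoiding a full extraction per nonce. A scalar sketch of the assumed mm256_extract_lane_4x64 semantics:

// Assumed behavior: copy bit_len bits of one lane to a contiguous
// buffer; a lane's 64-bit words sit 4 apart in the 4x64 layout.
static void extract_lane_4x64_ref( uint64_t *dst, const uint64_t *src,
                                   int lane, int bit_len )
{
   for ( int i = 0; i < bit_len/64; i++ )
      dst[i] = src[ 4*i + lane ];
}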


@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_bswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
mm128_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
if ( state->rembytes )
{
// remaining data bytes
casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
casti_m128i( state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) );
// padding of partial block
casti_m128i( state->buffer, 1 ) =
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_bswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
mm128_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
{
// padding of partial block
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm_bswap_32( cast_m128i( data ) ) );
mm128_bswap_32( cast_m128i( data ) ) );
}
else
{
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
casti_m128i( b, 0 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
rnd512( state, zero, zero );
@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
}
#endif


@@ -1,4 +1,4 @@
#include "allium-gate.h"
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>
@@ -7,7 +7,7 @@
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"
typedef struct {
@@ -108,7 +108,7 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256_4way_init( &allium_4way_ctx.blake );
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );


@@ -1,22 +0,0 @@
#include "allium-gate.h"
int64_t get_max64_0xFFFFLL() { return 0xFFFFLL; }
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_4WAY)
gate->miner_thread_init = (void*)&init_allium_4way_ctx;
gate->scanhash = (void*)&scanhash_allium_4way;
gate->hash = (void*)&allium_4way_hash;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&get_max64_0xFFFFLL;
return true;
};


@@ -1,29 +0,0 @@
#ifndef ALLIUM_GATE_H__
#define ALLIUM_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY
#endif
bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_4WAY)
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_allium_4way_ctx();
#endif
void allium_hash( void *state, const void *input );
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_allium_ctx();
#endif


@@ -1,9 +1,9 @@
#include "allium-gate.h"
#include "lyra2-gate.h"
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#else

algo/lyra2/lyra2-gate.c (new file, 178 lines)

@@ -0,0 +1,178 @@
#include "lyra2-gate.h"
__thread uint64_t* l2v3_wholeMatrix;
bool lyra2rev3_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v3_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV3_4WAY)
init_lyra2rev3_4way_ctx();
#else
init_lyra2rev3_ctx();
#endif
return l2v3_wholeMatrix != NULL;
}
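The sizing arithmetic, worked through under the assumption that BLOCK_LEN_INT64 is 12 as in reference Lyra2 (lyra2rev3 uses nRows = nCols = 4):

// ROW_LEN_INT64 = 12 * 4       = 48 uint64 per row
// ROW_LEN_BYTES = 48 * 8       = 384 bytes per row
// size          = 384 * 4 rows = 1536 bytes, allocated 64-byte aligned
// by _mm_malloc; the bool return reports allocation success.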
bool register_lyra2rev3_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV3_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_4way;
gate->hash = (void*)&lyra2rev3_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lyra2rev3;
gate->hash = (void*)&lyra2rev3_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
gate->set_target = (void*)&alt_set_target;
return true;
};
//////////////////////////////////
__thread uint64_t* l2v2_wholeMatrix;
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();
#else
init_lyra2rev2_ctx();
#endif
return l2v2_wholeMatrix != NULL;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->set_target = (void*)&alt_set_target;
return true;
};
/////////////////////////////
bool register_lyra2z_algo( algo_gate_t* gate )
{
#if defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
gate->hash = (void*)&lyra2z_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&alt_set_target;
return true;
};
////////////////////////
bool register_lyra2h_algo( algo_gate_t* gate )
{
#ifdef LYRA2H_4WAY
gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h_4way;
gate->hash = (void*)&lyra2h_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&alt_set_target;
return true;
};
/////////////////////////////////
int64_t allium_get_max64_0xFFFFLL() { return 0xFFFFLL; }
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_4WAY)
gate->miner_thread_init = (void*)&init_allium_4way_ctx;
gate->scanhash = (void*)&scanhash_allium_4way;
gate->hash = (void*)&allium_4way_hash;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&allium_get_max64_0xFFFFLL;
return true;
};
/////////////////////////////////////////
bool phi2_has_roots;
bool phi2_use_roots = false;
int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; }
void phi2_decode_extra_data( struct work *work )
{
if ( work->data[0] & ( 1<<30 ) ) phi2_use_roots = true;
else for ( int i = 20; i < 36; i++ )
{
if (work->data[i]) { phi2_use_roots = true; break; }
}
}
void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_tree[64] = { 0 };
size_t t;
algo_gate.gen_merkle_root( merkle_tree, sctx );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
(uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) );
for ( t = 0; t < 16; t++ )
g_work->data[ 20+t ] = ((uint32_t*)sctx->job.extra)[t];
}
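The single-line loop above is a little-endian multi-byte counter increment: ++xnonce2[t] wraps a byte to zero on overflow, which keeps the loop scanning for the carry position. Unrolled for clarity, with the same effect:

for ( t = 0; t < sctx->xnonce2_size; t++ )
{
   sctx->job.xnonce2[t]++;                  // bump lowest byte first
   if ( sctx->job.xnonce2[t] != 0 ) break;  // no carry into next byte
}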
bool register_phi2_algo( algo_gate_t* gate )
{
init_phi2_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->get_work_data_size = (void*)&phi2_get_work_data_size;
gate->decode_extra_data = (void*)&phi2_decode_extra_data;
gate->build_extraheader = (void*)&phi2_build_extraheader;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->scanhash = (void*)&scanhash_phi2;
return true;
}

algo/lyra2/lyra2-gate.h (new file, 154 lines)

@@ -0,0 +1,154 @@
#ifndef LYRA2_GATE_H__
#define LYRA2_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__)
#define LYRA2REV3_4WAY
#endif
extern __thread uint64_t* l2v3_wholeMatrix;
bool register_lyra2rev3_algo( algo_gate_t* gate );
#if defined(LYRA2REV3_4WAY)
void lyra2rev3_4way_hash( void *state, const void *input );
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_lyra2rev3_4way_ctx();
#else
void lyra2rev3_hash( void *state, const void *input );
int scanhash_lyra2rev3( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_lyra2rev3_ctx();
#endif
//////////////////////////////////
#if defined(__AVX2__)
#define LYRA2REV2_4WAY
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_lyra2rev2_4way_ctx();
#else
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_lyra2rev2_ctx();
#endif
/////////////////////////
#if defined(__SSE4_2__)
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
// #define LYRA2Z_8WAY
#endif
#define LYRA2Z_MATRIX_SIZE ( BLOCK_LEN_INT64 * 8 * 8 * 8 )
#if defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_8way_thread_init();
#elif defined(LYRA2Z_4WAY)
void lyra2z_4way_hash( void *state, const void *input );
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_4way_thread_init();
#else
void lyra2z_hash( void *state, const void *input );
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_thread_init();
#endif
////////////////////
#if defined(__AVX2__)
#define LYRA2H_4WAY
#endif
#define LYRA2H_MATRIX_SIZE ( BLOCK_LEN_INT64 * 16 * 16 * 8 )
#if defined(LYRA2H_4WAY)
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_4way_thread_init();
#else
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_thread_init();
#endif
//////////////////////////////////
#if defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY
#endif
bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_4WAY)
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_allium_4way_ctx();
#else
void allium_hash( void *state, const void *input );
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_allium_ctx();
#endif
/////////////////////////////////////////
extern bool phi2_has_roots;
bool register_phi2_algo( algo_gate_t* gate );
void phi2_hash( void *state, const void *input );
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_phi2_ctx();
#endif // LYRA2_GATE_H__

View File

@@ -211,6 +211,186 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
return 0;
}
/////////////////////////////////////////////////
int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
int64_t v64; // 64-bit var for memcpy
uint64_t instance = 0;
//====================================================================/
//=== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
/*
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
*/
uint64_t *ptrWord = wholeMatrix;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.: The memory matrix will temporarily hold the password: not to save memory,
//but to ensure that the locally copied password is overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
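// e.g. for the 32-byte pwd and salt passed by the lyra2 algos, with
// BLOCK_LEN_BLAKE2_SAFE_BYTES = 64: ( (32 + 32 + 48) / 64 ) + 1 = 2 blocks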
byte *ptrByte = (byte*) wholeMatrix;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
- (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
// from here on it's all SIMD access to state and matrix
// define vector pointers and adjust sizes and pointer offsets
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//updates the value of row* (deterministically picked during Setup)
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window were visited.
if (rowa == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
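// Worked trace for nRows = 8: rows 2..7 are computed against row* indices
// 0, 1, 0, 3, 2, 1; (step, window, gap) start at (1, 2, 1), become (3, 4, -1)
// once row 3 is done and (3, 8, 1) after row 7.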
//===================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++)
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
// step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
do
{
//Selects a pseudorandom index row*
//-----------------------------------------------
instance = state[instance & 0xF];
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
// rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-------------------------------------------
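// v3 difference: instead of deriving row* from state[0] as v2 does, the
// state word consulted next is itself selected by the previous 'instance'
// value, presumably to serialize the selections and hinder lookahead.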
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//----------------------------------------------------
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//----------------------------------------------------
} while (row != 0);
}
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
return 0;
}
//////////////////////////////////////////////////
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,

View File

@@ -50,6 +50,10 @@ int LYRA2REV2( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV3( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );

View File

@@ -1,4 +1,4 @@
#include "lyra2h-gate.h"
#include "lyra2-gate.h"
#ifdef LYRA2H_4WAY
@@ -36,7 +36,7 @@ void lyra2h_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
@@ -70,7 +70,7 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
lyra2h_4way_midstate( vdata );

View File

@@ -1,25 +0,0 @@
#include "lyra2h-gate.h"
#include "lyra2.h"
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2h_algo( algo_gate_t* gate )
{
#ifdef LYRA2H_4WAY
gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h_4way;
gate->hash = (void*)&lyra2h_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};

View File

@@ -1,32 +0,0 @@
#ifndef LYRA2H_GATE_H__
#define LYRA2H_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#define LYRA2H_4WAY
#endif
#define LYRA2H_MATRIX_SIZE BLOCK_LEN_INT64 * 16 * 16 * 8
#if defined(LYRA2H_4WAY)
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_4way_thread_init();
#endif
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_thread_init();
#endif

View File

@@ -1,4 +1,4 @@
#include "lyra2h-gate.h"
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"

View File

@@ -1,13 +1,13 @@
#include "lyra2rev2-gate.h"
#include "lyra2-gate.h"
#include <memory.h>
#if defined (__AVX2__)
#if defined (LYRA2REV2_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
typedef struct {
blake256_4way_context blake;
@@ -74,11 +74,11 @@ void lyra2rev2_4way_hash( void *state, const void *input )
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -101,7 +101,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );

View File

@@ -1,40 +0,0 @@
#include "lyra2rev2-gate.h"
__thread uint64_t* l2v2_wholeMatrix;
void lyra2rev2_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();
#else
init_lyra2rev2_ctx();
#endif
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};

View File

@@ -1,35 +0,0 @@
#ifndef LYRA2REV2_GATE_H__
#define LYRA2REV2_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__)
#define LYRA2REV2_4WAY
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_lyra2rev2_4way_ctx();
#endif
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_lyra2rev2_ctx();
#endif

View File

@@ -1,11 +1,11 @@
#include "lyra2rev2-gate.h"
#include "lyra2-gate.h"
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
//#include "lyra2.h"
typedef struct {

algo/lyra2/lyra2rev3-4way.c (Normal file, 110 lines)
View File

@@ -0,0 +1,110 @@
#include "lyra2-gate.h"
#include <memory.h>
#if defined (LYRA2REV3_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
typedef struct {
blake256_4way_context blake;
cubehashParam cube;
bmw256_4way_context bmw;
} lyra2v3_4way_ctx_holder;
static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
bool init_lyra2rev3_4way_ctx()
{
blake256_4way_init( &l2v3_4way_ctx.blake );
cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
bmw256_4way_init( &l2v3_4way_ctx.bmw );
return true;
}
void lyra2rev3_4way_hash( void *state, const void *input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
lyra2rev3_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
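// Why noncep = vdata + 76 above: mm128_interleave_4x32 stores 32-bit word i
// of lane l at v[ 4*i + l ], so the four lane copies of header word 19 (the
// nonce) land at indices 76..79. A simplified, self-contained scalar stand-in
// for the real helper (which takes a bit count rather than a word count):
#include <stdint.h>
#include <stdio.h>
static void interleave_4x32( uint32_t *v, const uint32_t *s0,
                             const uint32_t *s1, const uint32_t *s2,
                             const uint32_t *s3, int nwords )
{
   for ( int i = 0; i < nwords; i++ )
   {
      v[ 4*i     ] = s0[i];
      v[ 4*i + 1 ] = s1[i];
      v[ 4*i + 2 ] = s2[i];
      v[ 4*i + 3 ] = s3[i];
   }
}
int main()
{
   uint32_t edata[20] = {0}, vdata[80];
   edata[19] = 0xdeadbeef;                     // header word 19 is the nonce
   interleave_4x32( vdata, edata, edata, edata, edata, 20 );
   printf( "%08x\n", vdata[76] );              // lane 0's nonce slot
   return 0;
}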

algo/lyra2/lyra2rev3.c (Normal file, 102 lines)
View File

@@ -0,0 +1,102 @@
#include "lyra2-gate.h"
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/cubehash_sse2.h"
//#include "lyra2.h"
typedef struct {
cubehashParam cube;
// cubehashParam cube2;
sph_blake256_context blake;
sph_bmw256_context bmw;
} lyra2v3_ctx_holder;
static lyra2v3_ctx_holder lyra2v3_ctx;
static __thread sph_blake256_context l2v3_blake_mid;
bool init_lyra2rev3_ctx()
{
cubehashInit( &lyra2v3_ctx.cube, 256, 16, 32 );
// cubehashInit( &lyra2v3_ctx.cube2, 256, 16, 32 );
sph_blake256_init( &lyra2v3_ctx.blake );
sph_bmw256_init( &lyra2v3_ctx.bmw );
return true;
}
void l2v3_blake256_midstate( const void* input )
{
memcpy( &l2v3_blake_mid, &lyra2v3_ctx.blake, sizeof l2v3_blake_mid );
sph_blake256( &l2v3_blake_mid, input, 64 );
}
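// Midstate rationale: the 80-byte header is one full 64-byte blake256 block
// plus a 16-byte tail, and the nonce sits at bytes 76..79, entirely in the
// tail. The first compression is therefore identical for every nonce tried,
// so it is computed once per work unit and copied back before each hash.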
void lyra2rev3_hash( void *state, const void *input )
{
lyra2v3_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &lyra2v3_ctx, sizeof(lyra2v3_ctx) );
uint8_t hash[128] __attribute__ ((aligned (64)));
#define hashA hash
#define hashB (hash+64)
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
memcpy( &ctx.blake, &l2v3_blake_mid, sizeof l2v3_blake_mid );
sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
sph_blake256_close( &ctx.blake, hash );
LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );
cubehashUpdateDigest( &ctx.cube, (byte*) hashA,
(const byte*) hash, 32 );
LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );
sph_bmw256( &ctx.bmw, hash, 32 );
sph_bmw256_close( &ctx.bmw, hash );
memcpy( state, hash, 32 );
}
int scanhash_lyra2rev3(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t hash[8] __attribute__((aligned(64)));
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
swab32_array( endiandata, pdata, 20 );
l2v3_blake256_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2rev3_hash(hash, endiandata);
if (hash[7] <= Htarg )
{
if( fulltest(hash, ptarget) )
{
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -1,4 +1,4 @@
#include "lyra2z-gate.h"
#include "lyra2-gate.h"
#ifdef LYRA2Z_4WAY
@@ -36,7 +36,7 @@ void lyra2z_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
@@ -70,7 +70,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
lyra2z_4way_midstate( vdata );

View File

@@ -1,29 +0,0 @@
#include "lyra2z-gate.h"
#include "lyra2.h"
void lyra2z_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2z_algo( algo_gate_t* gate )
{
#if defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
gate->hash = (void*)&lyra2z_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
return true;
};

View File

@@ -1,46 +0,0 @@
#ifndef LYRA2Z_GATE_H__
#define LYRA2Z_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__SSE4_2__)
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
// #define LYRA2Z_8WAY
#endif
#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8
#if defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_8way_thread_init();
#elif defined(LYRA2Z_4WAY)
void lyra2z_4way_hash( void *state, const void *input );
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_4way_thread_init();
#else
void lyra2z_hash( void *state, const void *input );
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_thread_init();
#endif
#endif

View File

@@ -1,6 +1,6 @@
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2z-gate.h"
#include "lyra2-gate.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "avxdefs.h"

algo/lyra2/phi2.c (Normal file, 133 lines)
View File

@@ -0,0 +1,133 @@
/**
* Phi-2 algo Implementation
*/
#include "lyra2-gate.h"
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/gost/sph_gost.h"
#include "algo/cubehash/cubehash_sse2.h"
#ifdef __AES__
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
#endif
typedef struct {
cubehashParam cube;
sph_jh512_context jh;
#if defined(__AES__)
hashState_echo echo1;
hashState_echo echo2;
#else
sph_echo512_context echo1;
sph_echo512_context echo2;
#endif
sph_gost512_context gost;
sph_skein512_context skein;
} phi2_ctx_holder;
phi2_ctx_holder phi2_ctx;
void init_phi2_ctx()
{
cubehashInit( &phi2_ctx.cube, 512, 16, 32 );
sph_jh512_init(&phi2_ctx.jh);
#if defined(__AES__)
init_echo( &phi2_ctx.echo1, 512 );
init_echo( &phi2_ctx.echo2, 512 );
#else
sph_echo512_init(&phi2_ctx.echo1);
sph_echo512_init(&phi2_ctx.echo2);
#endif
sph_gost512_init(&phi2_ctx.gost);
sph_skein512_init(&phi2_ctx.skein);
};
void phi2_hash(void *state, const void *input)
{
unsigned char _ALIGN(128) hash[64];
unsigned char _ALIGN(128) hashA[64];
unsigned char _ALIGN(128) hashB[64];
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
phi2_has_roots ? 144 : 80 );
LYRA2RE( &hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8 );
LYRA2RE( &hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8 );
sph_jh512( &ctx.jh, (const void*)hashA, 64 );
sph_jh512_close( &ctx.jh, (void*)hash );
if ( hash[0] & 1 )
{
sph_gost512( &ctx.gost, (const void*)hash, 64 );
sph_gost512_close( &ctx.gost, (void*)hash );
}
else
{
#if defined(__AES__)
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512( &ctx.echo1, (const void*)hash, 64 );
sph_echo512_close( &ctx.echo1, (void*)hash );
sph_echo512( &ctx.echo2, (const void*)hash, 64 );
sph_echo512_close( &ctx.echo2, (void*)hash );
#endif
}
sph_skein512( &ctx.skein, (const void*)hash, 64 );
sph_skein512_close( &ctx.skein, (void*)hash );
for (int i=0; i<4; i++)
((uint64_t*)hash)[i] ^= ((uint64_t*)hash)[i+4];
memcpy(state, hash, 32);
}
int scanhash_phi2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[36];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
if(opt_benchmark){
ptarget[7] = 0x00ff;
}
phi2_has_roots = false;
for (int i=0; i < 36; i++) {
be32enc(&endiandata[i], pdata[i]);
if (i >= 20 && pdata[i]) phi2_has_roots = true;
}
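// Words 20..35 are phi2's optional header extension (64 bytes of chain
// roots): any nonzero word there selects the full 144-byte header over the
// 128-byte one, matching phi2_get_work_data_size in the gate.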
do {
be32enc(&endiandata[19], n);
phi2_hash(hash, endiandata);
if (hash[7] < Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 1;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -48,6 +48,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
// Serial data is only 32 bytes, so AVX2 is the limit for that dimension.
// However, 2-way parallel looks trivial to code for AVX-512 except for
// a data dependency on rowa.
#if defined __AVX2__
// only available with avx2
@@ -65,13 +69,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_ror256_1x64( s1); \
s1 = mm256_ror_1x64( s1); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rol256_1x64( s3 ); \
s3 = mm256_rol_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rol256_1x64( s1 ); \
s1 = mm256_rol_1x64( s1 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_ror256_1x64( s3 );
s3 = mm256_ror_1x64( s3 );
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -93,25 +97,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
d = mm_ror_64( _mm_xor_si128( d, a), 32 ); \
d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \
c = _mm_add_epi64( c, d ); \
b = mm_ror_64( _mm_xor_si128( b, c ), 24 ); \
b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \
a = _mm_add_epi64( a, b ); \
d = mm_ror_64( _mm_xor_si128( d, a ), 16 ); \
d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \
c = _mm_add_epi64( c, d ); \
b = mm_ror_64( _mm_xor_si128( b, c ), 63 );
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
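// Scalar reference for the step above: G_2X64 is the Blake2b G function with
// rotation constants 32, 24, 16, 63 run on two independent 64-bit lanes per
// __m128i. One lane, written with rotr64() from the top of this file:
static inline void g64( uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d )
{
   *a += *b; *d = rotr64( *d ^ *a, 32 );
   *c += *d; *b = rotr64( *b ^ *c, 24 );
   *a += *b; *d = rotr64( *d ^ *a, 16 );
   *c += *d; *b = rotr64( *b ^ *c, 63 );
}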
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_ror256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rol256_1x64( s6, s7 ); \
mm128_ror256_1x64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_rol256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rol256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_ror256_1x64( s6, s7 );
mm128_rol256_1x64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_ror256_1x64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -1080,6 +1080,8 @@ void neoscrypt_wait_for_diff( struct stratum_ctx *stratum )
}
}
int neoscrypt_get_work_data_size () { return 80; }
bool register_neoscrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT;
@@ -1092,7 +1094,7 @@ bool register_neoscrypt_algo( algo_gate_t* gate )
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
gate->work_data_size = 80;
gate->get_work_data_size = (void*)&neoscrypt_get_work_data_size;
return true;
};

View File

@@ -62,15 +62,15 @@ void nist5hash_4way( void *out, const void *input )
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhash );
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
skein512_4way_close( &ctx_skein, out );
}
int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
@@ -120,15 +120,16 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
nist5hash_4way( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( !( (hash+(i<<3))[7] & mask ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane<<1 ] & mask ) == 0 )  // 4x64 interleave: 32-bit lane stride is 2
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )

View File

@@ -219,6 +219,8 @@ void zr5_display_pok( struct work* work )
applog(LOG_BLUE, "POK received: %08x", work->data[0] );
}
int zr5_get_work_data_size() { return 80; }
bool register_zr5_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT;
@@ -227,12 +229,12 @@ bool register_zr5_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_zr5;
gate->hash = (void*)&zr5hash;
gate->get_max64 = (void*)&zr5_get_max64;
gate->display_extra_data = (void*)&zr5_display_pok;
gate->decode_extra_data = (void*)&zr5_display_pok;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
gate->work_data_size = 80;
gate->get_work_data_size = (void*)&zr5_get_work_data_size;
gate->work_cmp_size = 72;
return true;
};

View File

@@ -7,7 +7,7 @@
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -4,7 +4,7 @@
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#else

View File

@@ -7,7 +7,7 @@
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -4,7 +4,7 @@
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#ifndef NO_AES_NI

View File

@@ -184,7 +184,8 @@ void lbry_4way_hash( void* output, const void* input )
sha256_4way( &ctx_sha256, vhashA, 32 );
sha256_4way_close( &ctx_sha256, vhashA );
mm_deinterleave_4x32( output, output+32, output+64, output+96, vhashA, 256 );
mm128_deinterleave_4x32( output, output+32, output+64, output+96,
vhashA, 256 );
}
int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -209,7 +210,7 @@ int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
// we need bigendian data...
swab32_array( edata, pdata, 32 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
sha256_4way_init( &sha256_mid );
sha256_4way( &sha256_mid, vdata, LBRY_MIDSTATE );

View File

@@ -41,8 +41,6 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
free(xnonce2str);
}
// Don't use lbry_build_block_header, it can't handle claim; do it inline
// in lbry_build_extraheader. The side effect is no GBT support for lbry.
void lbry_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits )
@@ -61,9 +59,6 @@ void lbry_build_block_header( struct work* g_work, uint32_t version,
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( merkle_root + i );
// for ( int i = 0; i < 8; i++ )
// g_work->data[17 + i] = claim[i];
g_work->data[ LBRY_NTIME_INDEX ] = ntime;
g_work->data[ LBRY_NBITS_INDEX ] = nbits;
g_work->data[28] = 0x80000000;
@@ -80,10 +75,6 @@ void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
// algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
// (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_root,
// le32dec( sctx->job.ntime ), le32dec( sctx->job.nbits ) );
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
@@ -94,7 +85,7 @@ void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
for ( int i = 0; i < 8; i++ )
g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
g_work->data[17 + i] = ((uint32_t*)sctx->job.extra)[i];
g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime);
g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits);
@@ -108,6 +99,8 @@ void lbry_set_target( struct work* work, double job_diff )
int64_t lbry_get_max64() { return 0x1ffffLL; }
int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | SHA_OPT;
@@ -130,7 +123,7 @@ bool register_lbry_algo( algo_gate_t* gate )
gate->ntime_index = LBRY_NTIME_INDEX;
gate->nbits_index = LBRY_NBITS_INDEX;
gate->nonce_index = LBRY_NONCE_INDEX;
gate->work_data_size = LBRY_WORK_DATA_SIZE;
gate->get_work_data_size = (void*)&lbry_get_work_data_size;
return true;
}

View File

@@ -32,20 +32,20 @@ static const uint32_t IV[5] =
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, mm_not( y ) ), z )
_mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, mm_not( z ) ) )
_mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) )
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm_add_epi32( mm_rol_32( _mm_add_epi32( _mm_add_epi32( \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
_mm_set1_epi32( k ) ), s ), e ); \
c = mm_rol_32( c, 10 );\
c = mm128_rol_32( c, 10 );\
} while (0)
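// Scalar shape of RR for one lane, with rol32(x,c) = (x << c) | (x >> (32-c)):
//    a = rol32( a + f( b, c, d ) + r + k, s ) + e;
//    c = rol32( c, 10 );
// where f is one of F1..F5 and (s, k) vary per step, per the RIPEMD-160 spec.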
#define ROUND1(a, b, c, d, e, f, s, r, k) \

View File

@@ -98,19 +98,19 @@ static const sph_u32 K256[64] = {
#define BSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_ror_32(x, 2), mm_ror_32(x, 13) ), mm_ror_32( x, 22) )
mm128_ror_32(x, 2), mm128_ror_32(x, 13) ), mm128_ror_32( x, 22) )
#define BSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_ror_32(x, 6), mm_ror_32(x, 11) ), mm_ror_32( x, 25) )
mm128_ror_32(x, 6), mm128_ror_32(x, 11) ), mm128_ror_32( x, 25) )
#define SSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_ror_32(x, 7), mm_ror_32(x, 18) ), _mm_srli_epi32(x, 3) )
mm128_ror_32(x, 7), mm128_ror_32(x, 18) ), _mm_srli_epi32(x, 3) )
#define SSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm_ror_32(x, 17), mm_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
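// Scalar equivalents of the four macros above, i.e. the big and small sigma
// functions of FIPS 180-4; each __m128i macro applies the same formula to
// four lanes at once. With ror32(x,c) = (x >> c) | (x << (32-c)):
//    BSG2_0(x) = ror32(x, 2) ^ ror32(x,13) ^ ror32(x,22)
//    BSG2_1(x) = ror32(x, 6) ^ ror32(x,11) ^ ror32(x,25)
//    SSG2_0(x) = ror32(x, 7) ^ ror32(x,18) ^ (x >>  3)
//    SSG2_1(x) = ror32(x,17) ^ ror32(x,19) ^ (x >> 10)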
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
@@ -129,22 +129,22 @@ sha256_4way_round( __m128i *in, __m128i r[8] )
register __m128i A, B, C, D, E, F, G, H;
__m128i W[16];
W[ 0] = mm_bswap_32( in[ 0] );
W[ 1] = mm_bswap_32( in[ 1] );
W[ 2] = mm_bswap_32( in[ 2] );
W[ 3] = mm_bswap_32( in[ 3] );
W[ 4] = mm_bswap_32( in[ 4] );
W[ 5] = mm_bswap_32( in[ 5] );
W[ 6] = mm_bswap_32( in[ 6] );
W[ 7] = mm_bswap_32( in[ 7] );
W[ 8] = mm_bswap_32( in[ 8] );
W[ 9] = mm_bswap_32( in[ 9] );
W[10] = mm_bswap_32( in[10] );
W[11] = mm_bswap_32( in[11] );
W[12] = mm_bswap_32( in[12] );
W[13] = mm_bswap_32( in[13] );
W[14] = mm_bswap_32( in[14] );
W[15] = mm_bswap_32( in[15] );
W[ 0] = mm128_bswap_32( in[ 0] );
W[ 1] = mm128_bswap_32( in[ 1] );
W[ 2] = mm128_bswap_32( in[ 2] );
W[ 3] = mm128_bswap_32( in[ 3] );
W[ 4] = mm128_bswap_32( in[ 4] );
W[ 5] = mm128_bswap_32( in[ 5] );
W[ 6] = mm128_bswap_32( in[ 6] );
W[ 7] = mm128_bswap_32( in[ 7] );
W[ 8] = mm128_bswap_32( in[ 8] );
W[ 9] = mm128_bswap_32( in[ 9] );
W[10] = mm128_bswap_32( in[10] );
W[11] = mm128_bswap_32( in[11] );
W[12] = mm128_bswap_32( in[12] );
W[13] = mm128_bswap_32( in[13] );
W[14] = mm128_bswap_32( in[14] );
W[15] = mm128_bswap_32( in[15] );
A = r[0];
B = r[1];
@@ -289,13 +289,13 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
low = low << 3;
sc->buf[ pad >> 2 ] =
mm_bswap_32( _mm_set1_epi32( high ) );
mm128_bswap_32( _mm_set1_epi32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm_bswap_32( _mm_set1_epi32( low ) );
mm128_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc->buf, sc->val );
for ( u = 0; u < 8; u ++ )
((__m128i*)dst)[u] = mm_bswap_32( sc->val[u] );
((__m128i*)dst)[u] = mm128_bswap_32( sc->val[u] );
}
#if defined(__AVX2__)

View File

@@ -4,7 +4,6 @@
#include <string.h>
#include <stdio.h>
#include "sha2-hash-4way.h"
//#include <openssl/sha.h>
#if defined(SHA256T_8WAY)
@@ -25,11 +24,8 @@ void sha256t_8way_hash( void* output, const void* input )
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_close( &ctx, output );
mm256_deinterleave_8x32( output, output+ 32, output+ 64, output+ 96,
output+128, output+160, output+192, output+224,
vhash, 256 );
}
int scanhash_sha256t_8way( int thr_id, struct work *work,
@@ -84,14 +80,22 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,
sha256t_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( ( !( ( hash+(i<<3) )[7] & mask ) )
&& fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
uint32_t *hash7 = &(hash[7<<3]);
for ( int lane = 0; lane < 8; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce)
@@ -122,10 +126,8 @@ void sha256t_4way_hash( void* output, const void* input )
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_close( &ctx, output );
mm_deinterleave_4x32( output, output+ 32, output+ 64, output+ 96,
vhash, 256 );
}
int scanhash_sha256t_4way( int thr_id, struct work *work,
@@ -133,6 +135,8 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t edata[20] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -159,7 +163,7 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
for ( int k = 0; k < 19; k++ )
be32enc( &edata[k], pdata[k] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
@@ -175,15 +179,20 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
sha256t_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( ( !( ( hash+(i<<3) )[7] & mask ) )
&& fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );

View File

@@ -3,16 +3,18 @@
bool register_sha256t_algo( algo_gate_t* gate )
{
#if defined(SHA256T_8WAY)
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256t_8way;
gate->hash = (void*)&sha256t_8way_hash;
#elif defined(SHA256T_4WAY)
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256t_4way;
gate->hash = (void*)&sha256t_4way_hash;
#else
gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

View File

@@ -248,22 +248,22 @@ do { \
*/
#define SWAP_BC \
do { \
mm_swap_128( B0, C0 ); \
mm_swap_128( B1, C1 ); \
mm_swap_128( B2, C2 ); \
mm_swap_128( B3, C3 ); \
mm_swap_128( B4, C4 ); \
mm_swap_128( B5, C5 ); \
mm_swap_128( B6, C6 ); \
mm_swap_128( B7, C7 ); \
mm_swap_128( B8, C8 ); \
mm_swap_128( B9, C9 ); \
mm_swap_128( BA, CA ); \
mm_swap_128( BB, CB ); \
mm_swap_128( BC, CC ); \
mm_swap_128( BD, CD ); \
mm_swap_128( BE, CE ); \
mm_swap_128( BF, CF ); \
mm128_swap256_128( B0, C0 ); \
mm128_swap256_128( B1, C1 ); \
mm128_swap256_128( B2, C2 ); \
mm128_swap256_128( B3, C3 ); \
mm128_swap256_128( B4, C4 ); \
mm128_swap256_128( B5, C5 ); \
mm128_swap256_128( B6, C6 ); \
mm128_swap256_128( B7, C7 ); \
mm128_swap256_128( B8, C8 ); \
mm128_swap256_128( B9, C9 ); \
mm128_swap256_128( BA, CA ); \
mm128_swap256_128( BB, CB ); \
mm128_swap256_128( BC, CC ); \
mm128_swap256_128( BD, CD ); \
mm128_swap256_128( BE, CE ); \
mm128_swap256_128( BF, CF ); \
} while (0)
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
@@ -271,9 +271,9 @@ do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
_mm_andnot_si128( xb3, xb2 ), \
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
_mm_mullo_epi32( mm_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
) ), _mm_set1_epi32(3UL) ) ) ) ); \
xb0 = mm_not( _mm_xor_si128( xa0, mm_rol_32( xb0, 1 ) ) ); \
xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
} while (0)
#define PERM_STEP_0 do { \
@@ -335,22 +335,22 @@ do { \
#define APPLY_P \
do { \
B0 = mm_ror_32( B0, 15 ); \
B1 = mm_ror_32( B1, 15 ); \
B2 = mm_ror_32( B2, 15 ); \
B3 = mm_ror_32( B3, 15 ); \
B4 = mm_ror_32( B4, 15 ); \
B5 = mm_ror_32( B5, 15 ); \
B6 = mm_ror_32( B6, 15 ); \
B7 = mm_ror_32( B7, 15 ); \
B8 = mm_ror_32( B8, 15 ); \
B9 = mm_ror_32( B9, 15 ); \
BA = mm_ror_32( BA, 15 ); \
BB = mm_ror_32( BB, 15 ); \
BC = mm_ror_32( BC, 15 ); \
BD = mm_ror_32( BD, 15 ); \
BE = mm_ror_32( BE, 15 ); \
BF = mm_ror_32( BF, 15 ); \
B0 = mm128_ror_32( B0, 15 ); \
B1 = mm128_ror_32( B1, 15 ); \
B2 = mm128_ror_32( B2, 15 ); \
B3 = mm128_ror_32( B3, 15 ); \
B4 = mm128_ror_32( B4, 15 ); \
B5 = mm128_ror_32( B5, 15 ); \
B6 = mm128_ror_32( B6, 15 ); \
B7 = mm128_ror_32( B7, 15 ); \
B8 = mm128_ror_32( B8, 15 ); \
B9 = mm128_ror_32( B9, 15 ); \
BA = mm128_ror_32( BA, 15 ); \
BB = mm128_ror_32( BB, 15 ); \
BC = mm128_ror_32( BC, 15 ); \
BD = mm128_ror_32( BD, 15 ); \
BE = mm128_ror_32( BE, 15 ); \
BF = mm128_ror_32( BF, 15 ); \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \

View File

@@ -64,11 +64,11 @@ static const sph_u32 IV512[] = {
// a[3:0] = { b[0], a[3], a[2], a[1] }
#if defined(__SSSE3__)
#define mm_ror256hi_1x32( a, b ) _mm_alignr_epi8( b, a, 4 )
#define mm128_ror256hi_1x32( a, b ) _mm_alignr_epi8( b, a, 4 )
#else // SSE2
#define mm_ror256hi_1x32( a, b ) \
#define mm128_ror256hi_1x32( a, b ) \
_mm_or_si128( _mm_srli_si128( a, 4 ), \
_mm_slli_si128( b, 12 ) )
@@ -136,7 +136,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 )
@@ -145,7 +145,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 )
@@ -154,33 +154,33 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 )
@@ -193,80 +193,80 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_ror256hi_1x32( k12, k13 ) );
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_ror256hi_1x32( k13, k00 ) );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_ror256hi_1x32( k00, k01 ) );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_ror256hi_1x32( k01, k02 ) );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_ror256hi_1x32( k02, k03 ) );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_ror256hi_1x32( k03, k10 ) );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_ror256hi_1x32( k10, k11 ) );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_ror256hi_1x32( k11, k12 ) );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11
k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
@@ -275,36 +275,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm_ror256hi_1x32( k12, k13 ) );
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_ror256hi_1x32( k13, k00 ) );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_ror256hi_1x32( k00, k01 ) );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_ror256hi_1x32( k01, k02 ) );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_ror256hi_1x32( k02, k03 ) );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_ror256hi_1x32( k03, k10 ) );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_ror256hi_1x32( k10, k11 ) );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_ror256hi_1x32( k11, k12 ) );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
@@ -313,44 +313,44 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 13
k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
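For context, the renamed shavite helpers above are thin wrappers over SSE shuffles. A minimal sketch, assuming definitions along these lines (the exact macros live in the repo's vector headers; the 256-bit variant's argument order is an assumption):

#include <tmmintrin.h>   // SSE2 shuffles plus SSSE3 _mm_alignr_epi8

// Rotate the four 32-bit lanes of a 128-bit vector right by one lane:
// { a3, a2, a1, a0 } -> { a0, a3, a2, a1 }.
#define mm128_ror_1x32( v )  _mm_shuffle_epi32( v, 0x39 )

// Assumed definition: treat b:a as one 256-bit value and return the
// 128-bit window spanning both halves, one 32-bit lane to the right.
#define mm128_ror256hi_1x32( a, b )  _mm_alignr_epi8( b, a, 4 )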

View File

@@ -198,13 +198,13 @@ do { \
#undef BUTTERFLY_N
// Multiply by twiddle factors
X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );
X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].v256 );
X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].v256 );
X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].v256 );
X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].v256 );
X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].v256 );
X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].v256 );
X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].v256 );
// Transpose the FFT state with a revbin order permutation
// on the rows and the columns.
@@ -319,7 +319,7 @@ void fft128_2way( void *a )
B[ i ] = REDUCE_FULL_S( B[ i ] );
A[ i+8 ] = _mm256_sub_epi16( A[ i ], A[ i+8 ] );
A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].m256i );
A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].v256 );
A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
}
@@ -347,10 +347,10 @@ void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
do { \
__m256i t = X[i]; \
A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \
A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].v256 ); \
A[2*i+8] = REDUCE(A[2*i+8]); \
A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \
A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].v256 ); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
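The twiddle products above feed REDUCE, which in the SIMD hash is a partial reduction modulo 257. A sketch of the standard trick, assuming the AVX2 operand width used in this file:

#include <immintrin.h>

// Since 256 == -1 (mod 257), a 16-bit lane x = 256*hi + lo reduces to
// lo - hi. The result stays in a small signed range; REDUCE_FULL_S later
// maps it onto canonical representatives.
#define REDUCE( x ) \
   _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \
                     _mm256_srai_epi16( x, 8 ) )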
@@ -360,12 +360,12 @@ do { \
__m256i t = X[i]; \
__m256i tmp; \
A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \
A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].v256 ); \
A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
tmp = _mm256_unpackhi_epi8( t, m256_zero ); \
A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
FFT128_Twiddle[ 2*i+1 ].m256i );\
FFT128_Twiddle[ 2*i+1 ].v256 );\
A[2*i+9] = REDUCE( A[ 2*i+9 ] ); \
} while(0)
@@ -373,9 +373,9 @@ do { \
UNPACK( 1 );
UNPACK( 2 );
if ( final )
UNPACK_TWEAK( 3, FinalTweak.m256i );
UNPACK_TWEAK( 3, FinalTweak.v256 );
else
UNPACK_TWEAK( 3, Tweak.m256i );
UNPACK_TWEAK( 3, Tweak.v256 );
#undef UNPACK
#undef UNPACK_TWEAK
@@ -398,11 +398,11 @@ do { \
__m256i t = X[i]; \
A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
FFT256_Twiddle[ 2*i ].m256i ); \
FFT256_Twiddle[ 2*i ].v256 ); \
A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
A[ 2*i + 1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
FFT256_Twiddle[ 2*i + 1 ].m256i ); \
FFT256_Twiddle[ 2*i + 1 ].v256 ); \
A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
} while(0)
@@ -413,12 +413,12 @@ do { \
__m256i tmp; \
A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \
A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
FFT256_Twiddle[ 2*i ].m256i ); \
FFT256_Twiddle[ 2*i ].v256 ); \
A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
tmp = _mm256_unpackhi_epi8( t, m256_zero ); \
A[ 2*i + 1 ] = _mm256_add_epi16( tmp, tw ); \
A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
FFT256_Twiddle[ 2*i + 1 ].m256i ); \
FFT256_Twiddle[ 2*i + 1 ].v256 ); \
} while(0)
UNPACK( 0 );
@@ -429,9 +429,9 @@ do { \
UNPACK( 5 );
UNPACK( 6 );
if ( final )
UNPACK_TWEAK( 7, FinalTweak.m256i );
UNPACK_TWEAK( 7, FinalTweak.v256 );
else
UNPACK_TWEAK( 7, Tweak.m256i );
UNPACK_TWEAK( 7, Tweak.v256 );
#undef UNPACK
#undef UNPACK_TWEAK
@@ -447,7 +447,7 @@ void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
__m256i *S = (__m256i*) state;
__m256i *M = (__m256i*) msg;
__m256i *W = (__m256i*) fft;
static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) };
static const m256_v16 code[] = { mm256_const1_16(185), mm256_const1_16(233) };
S0l = _mm256_xor_si256( S[0], M[0] );
S0h = _mm256_xor_si256( S[1], M[1] );
@@ -612,9 +612,9 @@ do { \
int a = MSG_##u(hh); \
int b = MSG_##u(ll); \
w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \
w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \
w##l = _mm256_mullo_epi16( w##l, code[z].v256 ); \
w##h = _mm256_unpackhi_epi16( W[a], W[b]) ; \
w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \
w##h = _mm256_mullo_epi16( w##h, code[z].v256 ); \
} while(0)
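The .m256i -> .v256 change in these hunks is a member rename in the constant-vector unions such as m256_v16 above. A hedged sketch of the assumed shape of such a union (field layout inferred from usage, not copied from the source):

#include <immintrin.h>
#include <stdint.h>

// Assumed layout: scalar lanes overlaid with one 256-bit vector, the
// vector member now called v256 (formerly m256i).
typedef union {
   uint16_t u16[16];
   __m256i  v256;
} m256_v16;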
#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \

View File

@@ -21,9 +21,10 @@ void skeinhash_4way( void *state, const void *input )
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, vhash32 );
sha256_4way_close( &ctx_sha256, state );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash32, 256 );
mm128_deinterleave_4x32( state, state+32, state+64, state+96,
vhash32, 256 );
}
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -31,6 +32,8 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8];
uint32_t *hash7 = &(hash[7<<2]);
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -58,12 +61,16 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
skeinhash_4way( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
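The reworked scan loop above tests the interleaved lane-7 words in place (with 4x32 interleaving they sit contiguously at &hash[7<<2]) and deinterleaves only a passing lane. A minimal sketch of what mm128_extract_lane_4x32 is assumed to do, not the repo's literal implementation:

#include <stdint.h>

// With 4-way 32-bit interleaving, word j of lane l is at index 4*j + l.
// Copy the bit_len-bit hash of one lane into a contiguous buffer.
static inline void extract_lane_4x32( uint32_t *dst, const uint32_t *src,
                                      const int lane, const int bit_len )
{
   for ( int j = 0; j < bit_len >> 5; j++ )
      dst[ j ] = src[ ( j << 2 ) + lane ];
}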

View File

@@ -9,7 +9,6 @@ void skein2hash_4way( void *output, const void *input )
{
skein512_4way_context ctx;
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint64_t *out64 = (uint64_t*)output;
skein512_4way_init( &ctx );
skein512_4way( &ctx, input, 80 );
@@ -17,15 +16,14 @@ void skein2hash_4way( void *output, const void *input )
skein512_4way_init( &ctx );
skein512_4way( &ctx, hash, 64 );
skein512_4way_close( &ctx, hash );
mm256_deinterleave_4x64( out64, out64+4, out64+8, out64+12, hash, 256 );
skein512_4way_close( &ctx, output );
}
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[25]);
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint64_t *edata = (uint64_t*)endiandata;
@@ -34,7 +32,6 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
// hash is returned deinterleaved
uint32_t *nonces = work->nonces;
int num_found = 0;
@@ -53,12 +50,18 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
skein2hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
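skein2 keeps its hash 4x64-interleaved, so the same idea applies with 64-bit strides; a sketch under the same assumptions for mm256_extract_lane_4x64:

#include <stdint.h>

// With 4-way 64-bit interleaving, 64-bit word j of lane l is at index
// 4*j + l. Note that 32-bit word 7 of lane l is then the high half of
// 64-bit word 3, i.e. 32-bit index 25 + 2*l, which is why hash7 points
// at &hash[25] above.
static inline void extract_lane_4x64( uint64_t *dst, const uint64_t *src,
                                      const int lane, const int bit_len )
{
   for ( int j = 0; j < bit_len >> 6; j++ )
      dst[ j ] = src[ ( j << 2 ) + lane ];
}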

View File

@@ -125,20 +125,20 @@ void sm3_4way_close( void *cc, void *dst )
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm_bswap_32(
count[0] = mm128_bswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm_bswap_32( ctx->digest[i] );
hash[i] = mm128_bswap_32( ctx->digest[i] );
}
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rol_32( x, 9 ), \
mm_rol_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm_rol_32( x, 15 ), \
mm_rol_32( x, 23 ) ) )
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \
mm128_rol_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
mm128_rol_32( x, 23 ) ) )
#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
@@ -165,13 +165,13 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm_bswap_32( block[j] );
W[j] = mm128_bswap_32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
W[ j-9 ] ),
mm_rol_32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( mm_rol_32( W[ j-13 ], 7 ),
mm128_rol_32( W[ j-3 ], 15 ) ) ),
_mm_xor_si128( mm128_rol_32( W[ j-13 ], 7 ),
W[ j-6 ] ) );
for( j = 0; j < 64; j++ )
@@ -180,19 +180,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
T = _mm_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm_rol_32( _mm_add_epi32( _mm_add_epi32( mm_rol_32( A, 12 ), E ),
mm_rol_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rol_32( A, 12 ) );
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
mm128_rol_32( T, j ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rol_32( B, 9 );
C = mm128_rol_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rol_32( F, 19 );
G = mm128_rol_32( F, 19 );
F = E;
E = P0( TT2 );
}
@@ -200,19 +200,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
T = _mm_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm_rol_32( _mm_add_epi32( _mm_add_epi32( mm_rol_32( A, 12 ), E ),
mm_rol_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm_rol_32( A, 12 ) );
SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
mm128_rol_32( T, j&31 ) ), 7 );
SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ),
SS2 ), W1[j] );
TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
SS1 ), W[j] );
D = C;
C = mm_rol_32( B, 9 );
C = mm128_rol_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm_rol_32( F, 19 );
G = mm128_rol_32( F, 19 );
F = E;
E = P0( TT2 );
}
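The renamed SM3 rotates are plain SSE2 bit rotates on packed 32-bit lanes; a minimal sketch of the assumed expansion of mm128_rol_32:

#include <emmintrin.h>

// Rotate each 32-bit lane left by c bits.
#define mm128_rol_32( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )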

View File

@@ -13,7 +13,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -23,7 +23,7 @@
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c"

View File

@@ -13,7 +13,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };

View File

@@ -10,7 +10,7 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"
#else

View File

@@ -13,7 +13,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"

View File

@@ -9,7 +9,7 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/nist.h"

View File

@@ -12,7 +12,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -20,7 +20,7 @@
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c"

View File

@@ -15,7 +15,7 @@
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
typedef struct {

View File

@@ -23,7 +23,7 @@
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
typedef struct {

View File

@@ -14,7 +14,7 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -11,7 +11,7 @@
#include "algo/echo/sph_echo.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/keccak/sse2/keccak.c"

View File

@@ -13,7 +13,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -20,7 +20,7 @@
//#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -238,6 +238,8 @@ void drop_display_pok( struct work* work )
applog(LOG_BLUE, "POK received: %08xx", work->data[0] );
}
int drop_get_work_data_size() { return 80; }
// Need to fix POK offset problems like zr5
bool register_drop_algo( algo_gate_t* gate )
{
@@ -250,8 +252,8 @@ bool register_drop_algo( algo_gate_t* gate )
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
gate->display_extra_data = (void*)&drop_display_pok;
gate->work_data_size = 80;
gate->decode_extra_data = (void*)&drop_display_pok;
gate->get_work_data_size = (void*)&drop_get_work_data_size;
gate->work_cmp_size = 72;
return true;
};
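The gate change above replaces the fixed work_data_size field with a get_work_data_size function pointer, so algos with POK-style variable headers can report their own size. A hedged sketch of the pattern (the std_ name and 128-byte default are assumptions for illustration):

// Assumed default: most algos use a fixed-size work data buffer.
int std_get_work_data_size() { return 128; }   // assumed default size

// drop overrides the gate member with its own 80-byte size, as in the
// hunk above:
//    gate->get_work_data_size = (void*)&drop_get_work_data_size;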

View File

@@ -8,7 +8,7 @@
#include <stdio.h>
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/gost/sph_gost.h"
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -8,7 +8,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue//sph_fugue.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sph_jh.h"

View File

@@ -9,7 +9,7 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/gost/sph_gost.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
typedef struct {
skein512_4way_context skein;

View File

@@ -6,7 +6,7 @@
#include "algo/gost/sph_gost.h"
#include "algo/skein/sph_skein.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
typedef struct {
sph_skein512_context skein;

View File

@@ -13,7 +13,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"

View File

@@ -20,7 +20,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -13,7 +13,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -166,7 +166,7 @@ void x13sm3_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
// SM3 parallel 32 bit
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
@@ -182,7 +182,7 @@ void x13sm3_4way_hash( void *state, const void *input )
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
// Hamsi parallel 4x32x2
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );

View File

@@ -16,9 +16,8 @@
#include "algo/sm3/sph_sm3.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/echo/sse2/sph_echo.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"

View File

@@ -52,7 +52,7 @@ void polytimos_4way_hash( void *output, const void *input )
mm256_reinterleave_4x32( vhash32, vhash, 512 );
shabal512_4way( &ctx.shabal, vhash32, 64 );
shabal512_4way_close( &ctx.shabal, vhash32 );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );

View File

@@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );

View File

@@ -13,7 +13,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -183,21 +183,16 @@ void x14_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
shabal512_4way_close( &ctx.shabal, state );
}
int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
@@ -233,13 +228,21 @@ int scanhash_x14_4way( int thr_id, struct work *work, uint32_t max_nonce,
x14_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
uint32_t *hash7 = &(hash[7<<2]);
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
// deinterleave hash for lane
uint32_t lane_hash[8];
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )

View File

@@ -21,9 +21,8 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/echo/sse2/sph_echo.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"

View File

@@ -13,7 +13,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );

View File

@@ -22,7 +22,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -24,7 +24,7 @@
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/jh/sse2/jh_sse2_opt64.h"

View File

@@ -20,7 +20,7 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -256,11 +256,11 @@ void x16r_4way_hash( void* output, const void* input )
sph_fugue512_close( &ctx.fugue, hash3 );
break;
case SHABAL:
mm_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
mm128_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );

View File

@@ -17,7 +17,7 @@
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"

View File

@@ -13,7 +13,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -32,7 +32,7 @@ typedef struct {
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
cube_2way_context cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hashState_echo echo;
@@ -55,7 +55,7 @@ void init_x17_4way_ctx()
jh512_4way_init( &x17_4way_ctx.jh );
keccak512_4way_init( &x17_4way_ctx.keccak );
luffa_2way_init( &x17_4way_ctx.luffa, 512 );
cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 );
cube_2way_init( &x17_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x17_4way_ctx.shavite );
simd_2way_init( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 );
@@ -73,11 +73,11 @@ void x17_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash32[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x17_4way_ctx_holder ctx;
memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
// 1 Blake
// 1 Blake 4 way 64 bit
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
@@ -114,25 +114,22 @@ void x17_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
// 7 Luffa parallel 2 way
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
memcpy( &ctx.cube, &x17_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 8 Cubehash parallel 2 way
cube_2way_update_close( &ctx.cube, vhash, vhash, 64 );
cube_2way_reinit( &ctx.cube );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
// 9 Shavite
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 9 Shavite serial
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
@@ -148,7 +145,7 @@ void x17_4way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
// 10 Simd parallel 2 way 128 bit
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
@@ -157,7 +154,7 @@ void x17_4way_hash( void *state, const void *input )
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
// 11 Echo serial
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
@@ -170,13 +167,13 @@ void x17_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi
// 12 Hamsi parallel 4 way 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
@@ -189,11 +186,11 @@ void x17_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
// 14 Shabal, parallel 4 way 32 bit SSE
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -217,17 +214,18 @@ void x17_4way_hash( void *state, const void *input )
sha512_4way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
mm256_reinterleave_4x32( vhash32, vhash, 512 );
haval256_5_4way( &ctx.haval, vhash32, 64 );
haval256_5_4way_close( &ctx.haval, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
@@ -261,17 +259,20 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( noncep+6, n+3 );
x17_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
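The Luffa/Cubehash conversion above runs two lanes per call over 2x128-interleaved buffers (vhash holds lanes 0-1, vhashB lanes 2-3). A minimal sketch of the assumed mm256_interleave_2x128 layout, not the repo's vectorized implementation:

#include <stdint.h>
#include <string.h>

// 128-bit block j of lane l goes to block index 2*j + l.
static inline void interleave_2x128( void *dst, const void *src0,
                                     const void *src1, const int bit_len )
{
   uint8_t *d = (uint8_t*)dst;
   for ( int j = 0; j < bit_len >> 7; j++ )
   {
      memcpy( d + 32*j,      (const uint8_t*)src0 + 16*j, 16 );
      memcpy( d + 32*j + 16, (const uint8_t*)src1 + 16*j, 16 );
   }
}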

View File

@@ -22,7 +22,7 @@
#include "algo/haval/sph-haval.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"

View File

@@ -14,7 +14,7 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -201,10 +201,10 @@ void xevan_4way_hash( void *output, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// Parallel 4way 32 bit
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
// Serial
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
@@ -229,7 +229,7 @@ void xevan_4way_hash( void *output, const void *input )
mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 );
haval256_5_4way( &ctx.haval, vhash32, dataLen );
haval256_5_4way_close( &ctx.haval, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
@@ -339,10 +339,10 @@ void xevan_4way_hash( void *output, const void *input )
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
@@ -365,16 +365,15 @@ void xevan_4way_hash( void *output, const void *input )
mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 );
haval256_5_4way( &ctx.haval, vhash32, dataLen );
haval256_5_4way_close( &ctx.haval, vhash32 );
mm_deinterleave_4x32( output, output+32, output+64, output+96,
vhash32, 256 );
haval256_5_4way_close( &ctx.haval, output );
}
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
@@ -405,15 +404,16 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( noncep+6, n+3 );
xevan_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )

View File

@@ -19,7 +19,7 @@
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include "algo/simd/nist.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include <openssl/sha.h>
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"

Some files were not shown because too many files have changed in this diff.