commit 7a1389998b
parent af1c940919
Author: Jay D Dee
Date:   2017-12-14 18:28:51 -05:00

31 changed files with 1285 additions and 377 deletions


@@ -32,12 +32,12 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
// uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) endiandata[20];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
@@ -47,18 +47,17 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
// HTarget = 0x7f;
// we need big endian data...
swab32_array( endiandata, pdata, 20 );
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
endiandata, 640 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep, n );
be32enc( noncep +2, n+1 );
be32enc( noncep +4, n+2 );
be32enc( noncep +6, n+3 );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
blakehash_4way( hash, vdata );
@@ -74,7 +73,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
if ( (hash+8)[7] == 0 )
{
if ( fulltest( hash, ptarget ) )
if ( fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
@@ -83,7 +82,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
if ( (hash+16)[7] == 0 )
{
if ( fulltest( hash, ptarget ) )
if ( fulltest( hash+8, ptarget ) )
{
found[2] = true;
num_found++;
@@ -92,15 +91,14 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
if ( (hash+24)[7] == 0 )
{
if ( fulltest( hash, ptarget ) )
if ( fulltest( hash+8, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
}
n += 4;
n += 4;
*hashes_done = n - first_nonce + 1;
} while ( (num_found == 0) && (n < max_nonce)
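The fulltest fixes above follow from the lane layout of the deinterleaved 4-way output: lane i's 8-word hash starts at hash + 8*i, so each lane must be tested at its own offset. A minimal sketch of the same per-lane check written as a loop, using the names from the scanhash above:

// Each lane's 256-bit result occupies hash[8*lane .. 8*lane+7].
for ( int lane = 0; lane < 4; lane++ )
{
   uint32_t *lane_hash = hash + 8*lane;
   if ( ( lane_hash[7] == 0 ) && fulltest( lane_hash, ptarget ) )
   {
      found[lane] = true;
      num_found++;
      nonces[lane] = n + lane;
   }
}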


@@ -17,7 +17,6 @@ bool register_blake_algo( algo_gate_t* gate )
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_blake_4way;
gate->hash = (void*)&blakehash_4way;
four_way_not_tested();
#else
gate->scanhash = (void*)&scanhash_blake;
gate->hash = (void*)&blakehash;


@@ -524,18 +524,18 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( s0, _mmset_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( s1, _mmset_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( s2, _mmset_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( s3, _mmset_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
_mmset_epi32( CS4, CS4, CS4, CS4 ) ); \
VD = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
_mmset_epi32( CS5, CS5, CS5, CS5 ) ); \
VE = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ) \
, _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
_mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
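Each corrected line above broadcasts a single scalar constant into every vector lane. The single-argument set1 intrinsics are the equivalent shorthand — a possible simplification, not something this commit applies:

// Broadcast forms: set1 splats one scalar to all lanes.
__m128i v32 = _mm_set1_epi32( CS0 );      // == _mm_set_epi32( CS0, CS0, CS0, CS0 )
__m256i v64 = _mm256_set1_epi64x( CB0 );  // == _mm256_set_epi64x( CB0, CB0, CB0, CB0 )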
@@ -710,18 +710,18 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si128( _mm256_set_epi64( T0, T0, T0, T0 ), \
_mm256_set_epi64( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64( T0, T0, T0, T0 ), \
_mm256_set_epi64( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
_mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
_mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
@@ -867,7 +867,6 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
@@ -915,9 +914,10 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
unsigned z = 0x80 >> n;
unsigned zz = ((ub & -z) | z) & 0xFF;
u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
// unsigned z = 0x80 >> n;
// unsigned zz = ((ub & -z) | z) & 0xFF;
// u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
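The commented-out lines implemented the generic sph-style padding for an input with n trailing extra bits: z = 0x80 >> n isolates the stop bit, and zz = ((ub & -z) | z) & 0xFF keeps the n extra (high) bits while setting the stop bit below them. With byte-aligned input (n == 0, ub == 0) this always collapses to 0x80, which is what the replacement stores directly:

// n = 0:            z = 0x80, zz = ((ub & 0xFFFFFF80) | 0x80) & 0xFF = 0x80
// n = 3, ub = 0xA0: z = 0x10, zz = ((0xA0 & 0xFFFFFFF0) | 0x10) & 0xFF = 0xB0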
@@ -934,9 +934,11 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
else
sc->T0 -= 512 - bit_len;
if ( ptr <= 48 )
// if ( ptr <= 48 )
if ( ptr <= 52 )
{
memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
// memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
_mm_set_epi32( 0x01000000, 0x01000000,
@@ -962,6 +964,7 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_32( sc->H[k] );
// out[k] = sc->H[k];
}
#if defined (__AVX2__)


@@ -13,46 +13,35 @@ static __thread bool ctx_midstate_done = false;
void decred_hash_4way( void *state, const void *input )
{
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
blake256_4way_context ctx __attribute__ ((aligned (64)));
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
uint32_t hash[16] __attribute__ ((aligned (64)));
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
void *tail = input + DECRED_MIDSTATE_LEN;
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
// #define MIDSTATE_LEN 128
/*
uint8_t *ending = (uint8_t*) input;
ending += MIDSTATE_LEN;
if ( !ctx_midstate_done )
{
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN );
ctx_midstate_done = true;
}
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
*/
/*
sph_blake256_init( &ctx2 );
sph_blake256( &ctx2, sin0, 180 );
sph_blake256_close( &ctx2, hash );
*/
/*
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 180 );
blake256_4way_close( &ctx, vhash );
*/
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
/*
for ( int i = 0; i < 8; i++ )
@@ -66,22 +55,21 @@ printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
printf("\n");
*/
// memcpy( state, hash0, 32 );
// memcpy( state+32, hash1, 32 );
// memcpy( state+64, hash1, 32 );
// memcpy( state+96, hash1, 32 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
memcpy( state, hash, 32 );
// memcpy( state, hash, 32 );
}
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t vdata[45*4] __attribute__ ((aligned (64)));
uint32_t hash[4*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) endiandata[48];
// uint32_t _ALIGN(64) hash32[8];
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
@@ -91,28 +79,25 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
bool *found = work->nfound;
int num_found = 0;
// #define DCR_NONCE_OFT32 35
ctx_midstate_done = false;
// memcpy(endiandata, pdata, 180);
memcpy( edata, pdata, 180 );
// use the old way until new way updated for size.
mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {
found[0] = found[1] = found[2] = found[3] = false;
* noncep = n;
*(noncep+2) = n+1;
*(noncep+4) = n+2;
*(noncep+6) = n+3;
*(noncep+1) = n+1;
*(noncep+2) = n+2;
*(noncep+3) = n+3;
decred_hash_4way( hash, vdata );
// endiandata[DCR_NONCE_OFT32] = n;
// decred_hash(hash32, endiandata);
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
{
work_set_target_ratio( work, hash );
@@ -121,29 +106,47 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
nonces[0] = n;
pdata[DECRED_NONCE_INDEX] = n;
}
/* if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
/*
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
work_set_target_ratio( work, hash+8 );
found[1] = true;
num_found++;
nonces[1] = n;
nonces[1] = n+1;
}
*/
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
work_set_target_ratio( work, hash+16 );
found[2] = true;
num_found++;
nonces[2] = n;
nonces[2] = n+2;
}
/*
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
work_set_target_ratio( work, hash+24 );
found[3] = true;
num_found++;
nonces[3] = n;
nonces[3] = n+3;
}
*/
n += 4;
n += 2;
// n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
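The scanhash above hashes the constant first DECRED_MIDSTATE_LEN bytes once per work item (blake_mid) and reprocesses only the tail for each nonce. Because the data is interleaved 4x32, the tail's byte offset into the vector buffer is four times the per-lane offset, hence DECRED_MIDSTATE_LEN << 2. A sketch of the resume step using this file's names (the code path the commented-out block in decred_hash_4way was working toward):

blake256_4way_context ctx;
memcpy( &ctx, &blake_mid, sizeof blake_mid );             // resume after midstate
const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );  // offset scaled by 4 lanes
blake256_4way( &ctx, tail, 180 - DECRED_MIDSTATE_LEN );   // per-lane tail length
blake256_4way_close( &ctx, vhash );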


@@ -872,6 +872,7 @@ blake32_close(sph_blake_small_context *sc,
} else {
sc->T0 -= 512 - bit_len;
}
if (bit_len <= 446) {
memset(u.buf + ptr + 1, 0, 55 - ptr);
if (out_size_w32 == 8)


@@ -25,7 +25,7 @@ void jha_hash_4way( void *out, const void *input )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
__m256i mask0, mask1;
__m256i mask, mask0, mask1;
__m256i* vh = (__m256i*)vhash;
__m256i* vh0 = (__m256i*)vhash0;
__m256i* vh1 = (__m256i*)vhash1;
@@ -47,38 +47,37 @@ void jha_hash_4way( void *out, const void *input )
// Heavy & Light Pair Loop
for ( int round = 0; round < 3; round++ )
{
// memset_zero_256( vh0, 20 );
// memset_zero_256( vh1, 20 );
// positive logic, if maski select vhi
// going from bit to mask reverses logic such that if the test bit is set
// zero will be put in mask0, meaning don't take vh0. mask1 is
// inverted so 1 will be put in mask1 meaning take it.
mask0 = mm256_negate_64(
// select next function based on bit 0 of previous hash.
// Speculatively execute both functions and use a mask to
// select the result from the correct function for each lane.
// hash = mask ? vhash1 : vhash0
mask = mm256_negate_64(
_mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
mask1 = mm256_not( mask0 );
// second version
// mask0 = mask
// mask1 = mm256_not( mask );
// first version
// mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
// _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
// groestl (serial) v skein
// groestl (serial) vs skein
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
(char*)hash0, 512 );
(char*)hash0, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash1,
(char*)hash1, 512 );
(char*)hash1, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash2,
(char*)hash2, 512 );
(char*)hash2, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(char*)hash3, 512 );
(char*)hash3, 512 );
mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
@@ -91,14 +90,20 @@ void jha_hash_4way( void *out, const void *input )
// merge vectored hash
for ( int i = 0; i < 8; i++ )
{
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
_mm256_and_si256( vh1[i], mask1 ) );
// blend should be faster
vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );
// second version
// vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
// _mm256_and_si256( vh1[i], mask1 ) );
// first version
/*
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_not( mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
vh0[i] = _mm256_maskload_epi64(
vhash0 + i*4, mm256_not( mask ) );
vh1[i] = _mm256_maskload_epi64(
vhash1 + i*4, mask );
vh[i] = _mm256_or_si256( vh0[i], vh1[i] );
*/
}
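The merge step selects, per 64-bit lane, between the two speculatively computed hashes according to bit 0 of the previous hash. mm256_negate_64 turns the isolated 0/1 bit into an all-zeros/all-ones mask (two's-complement negation), and _mm256_blendv_epi8 takes vh1 wherever the mask bytes have their top bit set. An equivalent sketch with raw intrinsics, writing the negation as 0 - x:

__m256i bit  = _mm256_and_si256( vh[0], _mm256_set1_epi64x( 1 ) );
__m256i mask = _mm256_sub_epi64( _mm256_setzero_si256(), bit );  // 0 -> 0, 1 -> all ones
vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );              // mask ? vh1[i] : vh0[i]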

algo/lyra2/lyra2h.c (new file, 93 lines)

@@ -0,0 +1,93 @@
#include <memory.h>
#include <mm_malloc.h>
#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
__thread uint64_t* lyra2h_matrix;
bool lyra2h_thread_init()
{
const int i = 16 * 16 * 96;
lyra2h_matrix = _mm_malloc( i, 64 );
return lyra2h_matrix;
}
static __thread sph_blake256_context lyra2h_blake_mid;
void lyra2h_midstate( const void* input )
{
sph_blake256_init( &lyra2h_blake_mid );
sph_blake256( &lyra2h_blake_mid, input, 64 );
}
void lyra2h_hash( void *state, const void *input )
{
uint32_t _ALIGN(64) hash[16];
sph_blake256_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &lyra2h_blake_mid, sizeof lyra2h_blake_mid );
sph_blake256( &ctx_blake, input + 64, 16 );
sph_blake256_close( &ctx_blake, hash );
LYRA2Z( lyra2h_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
memcpy(state, hash, 32);
}
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if (opt_benchmark)
ptarget[7] = 0x0000ff;
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
lyra2h_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2h_hash( hash, endiandata );
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2h_algo( algo_gate_t* gate )
{
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};
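lyra2h_midstate caches the Blake-256 state over the constant first 64 bytes of the 80-byte header, so each nonce iteration in scanhash_lyra2h rehashes only the final 16 bytes:

lyra2h_midstate( endiandata );       // once per work item (first 64 bytes)
be32enc( &endiandata[19], nonce );   // vary the nonce (word 19)
lyra2h_hash( hash, endiandata );     // resumes from the cached midstate

lyra2h_set_target scales the share target by job_diff / 256 (with opt_diff_factor applied), i.e. a diff-256 share convention.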


@@ -110,7 +110,8 @@ printf("found 0\n");
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
/* if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
/*
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
found[1] = true;


@@ -65,13 +65,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotl256_1x64( s1); \
s1 = mm256_rotr256_1x64( s1); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotr256_1x64( s3 ); \
s3 = mm256_rotl256_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotr256_1x64( s1 ); \
s1 = mm256_rotl256_1x64( s1 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotl256_1x64( s3 );
s3 = mm256_rotr256_1x64( s3 );
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
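The corrected round rotates rows s1..s3 so the state diagonals line up as columns before the second G application, then applies the inverse rotations afterwards; the commit swaps which rotation direction is used at each step. Assuming the helper names mean what they suggest (rotr moves each 64-bit lane toward index 0), the raw-intrinsic equivalents are:

s1 = _mm256_permute4x64_epi64( s1, 0x39 );  // mm256_rotr256_1x64: take lanes 1,2,3,0
s2 = _mm256_permute4x64_epi64( s2, 0x4e );  // mm256_swap_128:     take lanes 2,3,0,1
s3 = _mm256_permute4x64_epi64( s3, 0x93 );  // mm256_rotl256_1x64: take lanes 3,0,1,2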


@@ -0,0 +1,673 @@
/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */
/*
* SHAvite-3 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stdio.h>
#include <stddef.h>
#include <string.h>
#ifdef __AES__
#include "sph_shavite.h"
#include "avxdefs.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE
#define SPH_SMALL_FOOTPRINT_SHAVITE 1
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#define C32 SPH_C32
/*
* As of round 2 of the SHA-3 competition, the published reference
* implementation and test vectors are wrong, because they use
* big-endian AES tables while the internal decoding uses little-endian.
* The code below follows the specification. To turn it into a code
* which follows the reference implementation (the one called "BugFix"
* on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
* the code below (from the '#define AES_BIG_ENDIAN...' to the definition
* of the AES_ROUND_NOKEY macro) and replace it with the version which
* is commented out afterwards.
*/
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
static const sph_u32 IV512[] = {
C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
sph_u32 t0 = (x0); \
sph_u32 t1 = (x1); \
sph_u32 t2 = (x2); \
sph_u32 t3 = (x3); \
AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
} while (0)
#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \
sph_u32 kt; \
AES_ROUND_NOKEY(k1, k2, k3, k0); \
kt = (k0); \
(k0) = (k1); \
(k1) = (k2); \
(k2) = (k3); \
(k3) = kt; \
} while (0)
#if SPH_SMALL_FOOTPRINT_SHAVITE
/*
* This function assumes that "msg" is aligned for 32-bit access.
*/
static void
c512(sph_shavite_big_context *sc, const void *msg)
{
sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
sph_u32 rk[448];
size_t u;
int r, s;
#if SPH_LITTLE_ENDIAN
memcpy(rk, msg, 128);
#else
for (u = 0; u < 32; u += 4) {
rk[u + 0] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 0);
rk[u + 1] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 4);
rk[u + 2] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 8);
rk[u + 3] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 12);
}
#endif
u = 32;
for (;;) {
for (s = 0; s < 4; s ++) {
sph_u32 x0, x1, x2, x3;
x0 = rk[u - 31];
x1 = rk[u - 30];
x2 = rk[u - 29];
x3 = rk[u - 32];
AES_ROUND_NOKEY(x0, x1, x2, x3);
rk[u + 0] = x0 ^ rk[u - 4];
rk[u + 1] = x1 ^ rk[u - 3];
rk[u + 2] = x2 ^ rk[u - 2];
rk[u + 3] = x3 ^ rk[u - 1];
if (u == 32) {
rk[ 32] ^= sc->count0;
rk[ 33] ^= sc->count1;
rk[ 34] ^= sc->count2;
rk[ 35] ^= SPH_T32(~sc->count3);
} else if (u == 440) {
rk[440] ^= sc->count1;
rk[441] ^= sc->count0;
rk[442] ^= sc->count3;
rk[443] ^= SPH_T32(~sc->count2);
}
u += 4;
x0 = rk[u - 31];
x1 = rk[u - 30];
x2 = rk[u - 29];
x3 = rk[u - 32];
AES_ROUND_NOKEY(x0, x1, x2, x3);
rk[u + 0] = x0 ^ rk[u - 4];
rk[u + 1] = x1 ^ rk[u - 3];
rk[u + 2] = x2 ^ rk[u - 2];
rk[u + 3] = x3 ^ rk[u - 1];
if (u == 164) {
rk[164] ^= sc->count3;
rk[165] ^= sc->count2;
rk[166] ^= sc->count1;
rk[167] ^= SPH_T32(~sc->count0);
} else if (u == 316) {
rk[316] ^= sc->count2;
rk[317] ^= sc->count3;
rk[318] ^= sc->count0;
rk[319] ^= SPH_T32(~sc->count1);
}
u += 4;
}
if (u == 448)
break;
for (s = 0; s < 8; s ++) {
rk[u + 0] = rk[u - 32] ^ rk[u - 7];
rk[u + 1] = rk[u - 31] ^ rk[u - 6];
rk[u + 2] = rk[u - 30] ^ rk[u - 5];
rk[u + 3] = rk[u - 29] ^ rk[u - 4];
u += 4;
}
}
p0 = sc->h[0x0];
p1 = sc->h[0x1];
p2 = sc->h[0x2];
p3 = sc->h[0x3];
p4 = sc->h[0x4];
p5 = sc->h[0x5];
p6 = sc->h[0x6];
p7 = sc->h[0x7];
p8 = sc->h[0x8];
p9 = sc->h[0x9];
pA = sc->h[0xA];
pB = sc->h[0xB];
pC = sc->h[0xC];
pD = sc->h[0xD];
pE = sc->h[0xE];
pF = sc->h[0xF];
u = 0;
for (r = 0; r < 14; r ++) {
#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \
sph_u32 x0, x1, x2, x3; \
x0 = r0 ^ rk[u ++]; \
x1 = r1 ^ rk[u ++]; \
x2 = r2 ^ rk[u ++]; \
x3 = r3 ^ rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
l0 ^= x0; \
l1 ^= x1; \
l2 ^= x2; \
l3 ^= x3; \
} while (0)
#define WROT(a, b, c, d) do { \
sph_u32 t = d; \
d = c; \
c = b; \
b = a; \
a = t; \
} while (0)
C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);
C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);
WROT(p0, p4, p8, pC);
WROT(p1, p5, p9, pD);
WROT(p2, p6, pA, pE);
WROT(p3, p7, pB, pF);
#undef C512_ELT
#undef WROT
}
sc->h[0x0] ^= p0;
sc->h[0x1] ^= p1;
sc->h[0x2] ^= p2;
sc->h[0x3] ^= p3;
sc->h[0x4] ^= p4;
sc->h[0x5] ^= p5;
sc->h[0x6] ^= p6;
sc->h[0x7] ^= p7;
sc->h[0x8] ^= p8;
sc->h[0x9] ^= p9;
sc->h[0xA] ^= pA;
sc->h[0xB] ^= pB;
sc->h[0xC] ^= pC;
sc->h[0xD] ^= pD;
sc->h[0xE] ^= pE;
sc->h[0xF] ^= pF;
}
#else
/*
* This function assumes that "msg" is aligned for 32-bit access.
*/
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
__m128i p0, p1, p2, p3, x;
__m128i k00, k01, k02, k03, k10, k11, k12, k13;
__m128i *m = (__m128i*)msg;
__m128i *h = (__m128i*)sc->h;
int r;
p0 = h[0];
p1 = h[1];
p2 = h[2];
p3 = h[3];
// round
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = m[2];
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = m[3];
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
k10 = m[4];
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = m[5];
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = m[6];
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = m[7];
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 )
k00 = _mm_xor_si128( k00, _mm_set_epi32(
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 )
k01 = _mm_xor_si128( k01, _mm_set_epi32(
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 )
k13 = _mm_xor_si128( k13, _mm_set_epi32(
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p1 = _mm_xor_si128( p1, x );
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p3 = _mm_xor_si128( p3, x );
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p2 = _mm_xor_si128( p2, x );
}
// round 13
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, mm_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, mm_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, mm_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, mm_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, mm_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, mm_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, mm_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, mm_zero );
p1 = _mm_xor_si128( p1, x );
h[0] = _mm_xor_si128( h[0], p2 );
h[1] = _mm_xor_si128( h[1], p3 );
h[2] = _mm_xor_si128( h[2], p0 );
h[3] = _mm_xor_si128( h[3], p1 );
}
#endif
static void
shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )
{
memcpy( sc->h, iv, sizeof sc->h );
sc->ptr = 0;
sc->count0 = 0;
sc->count1 = 0;
sc->count2 = 0;
sc->count3 = 0;
}
static void
shavite_big_aesni_core( sph_shavite_big_context *sc, const void *data,
size_t len )
{
unsigned char *buf;
size_t ptr;
buf = sc->buf;
ptr = sc->ptr;
while (len > 0) {
size_t clen;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen;
if (ptr == sizeof sc->buf) {
if ((sc->count0 = SPH_T32(sc->count0 + 1024)) == 0) {
sc->count1 = SPH_T32(sc->count1 + 1);
if (sc->count1 == 0) {
sc->count2 = SPH_T32(sc->count2 + 1);
if (sc->count2 == 0) {
sc->count3 = SPH_T32(
sc->count3 + 1);
}
}
}
c512(sc, buf);
ptr = 0;
}
}
sc->ptr = ptr;
}
static void
shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
unsigned char *buf;
size_t ptr, u;
unsigned z;
sph_u32 count0, count1, count2, count3;
buf = sc->buf;
ptr = sc->ptr;
count0 = (sc->count0 += SPH_T32(ptr << 3) + n);
count1 = sc->count1;
count2 = sc->count2;
count3 = sc->count3;
z = 0x80 >> n;
z = ((ub & -z) | z) & 0xFF;
if (ptr == 0 && n == 0) {
buf[0] = 0x80;
memset(buf + 1, 0, 109);
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
} else if (ptr < 110) {
buf[ptr ++] = z;
memset(buf + ptr, 0, 110 - ptr);
} else {
buf[ptr ++] = z;
memset(buf + ptr, 0, 128 - ptr);
c512(sc, buf);
memset(buf, 0, 110);
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
}
sph_enc32le(buf + 110, count0);
sph_enc32le(buf + 114, count1);
sph_enc32le(buf + 118, count2);
sph_enc32le(buf + 122, count3);
buf[126] = (unsigned char) (out_size_w32 << 5);
buf[127] = (unsigned char) (out_size_w32 >> 3);
c512(sc, buf);
for (u = 0; u < out_size_w32; u ++)
sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]);
}
void
sph_shavite512_aesni_init(void *cc)
{
shavite_big_aesni_init(cc, IV512);
}
void
sph_shavite512_aesni(void *cc, const void *data, size_t len)
{
shavite_big_aesni_core(cc, data, len);
}
void
sph_shavite512_aesni_close(void *cc, void *dst)
{
shavite_big_aesni_close(cc, 0, 0, dst, 16);
}
void
sph_shavite512_aesni_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst)
{
shavite_big_aesni_close(cc, ub, n, dst, 16);
}
#ifdef __cplusplus
}
#endif
#endif
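Every AES_ROUND_NOKEY in the scalar path above corresponds to one aesenc with an all-zero round key in the vector path: aesenc performs ShiftRows, SubBytes and MixColumns, then XORs in the round key, so a zero key leaves the bare round function. A self-contained sketch (mm_zero is assumed to be the repo's zero constant; _mm_setzero_si128() is the portable spelling):

#include <wmmintrin.h>  // AES-NI intrinsics; build with -maes
static inline __m128i aes_round_nokey( __m128i x )
{
   return _mm_aesenc_si128( x, _mm_setzero_si128() );
}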


@@ -1731,21 +1731,21 @@ sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
/* see sph_shavite.h */
void
sph_shavite512_init(void *cc)
sph_shavite512_sw_init(void *cc)
{
shavite_big_init(cc, IV512);
}
/* see sph_shavite.h */
void
sph_shavite512(void *cc, const void *data, size_t len)
sph_shavite512_sw(void *cc, const void *data, size_t len)
{
shavite_big_core(cc, data, len);
}
/* see sph_shavite.h */
void
sph_shavite512_close(void *cc, void *dst)
sph_shavite512_sw_close(void *cc, void *dst)
{
shavite_big_close(cc, 0, 0, dst, 16);
// shavite_big_init(cc, IV512);
@@ -1753,7 +1753,7 @@ sph_shavite512_close(void *cc, void *dst)
/* see sph_shavite.h */
void
sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shavite_big_close(cc, ub, n, dst, 16);
// shavite_big_init(cc, IV512);


@@ -77,9 +77,9 @@ extern "C"{
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
unsigned char buf[64] __attribute__ ((aligned (64)));
sph_u32 h[8] __attribute__ ((aligned (32)));
size_t ptr;
sph_u32 h[8];
sph_u32 count0, count1;
#endif
} sph_shavite_small_context;
@@ -108,9 +108,9 @@ typedef sph_shavite_small_context sph_shavite256_context;
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128]; /* first field, for alignment */
unsigned char buf[128] __attribute__ ((aligned (64)));
sph_u32 h[16] __attribute__ ((aligned (32)));
size_t ptr;
sph_u32 h[16];
sph_u32 count0, count1, count2, count3;
#endif
} sph_shavite_big_context;
@@ -262,51 +262,37 @@ void sph_shavite384_close(void *cc, void *dst);
void sph_shavite384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a SHAvite-512 context. This process performs no memory allocation.
*
* @param cc the SHAvite-512 context (pointer to a
* <code>sph_shavite512_context</code>)
*/
void sph_shavite512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite512(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-512 context
* @param dst the destination buffer
*/
void sph_shavite512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite512_addbits_and_close(
// Always define sw but only define aesni when available
// Define fptrs for aesni or sw, not both.
void sph_shavite512_sw_init(void *cc);
void sph_shavite512_sw(void *cc, const void *data, size_t len);
void sph_shavite512_sw_close(void *cc, void *dst);
void sph_shavite512_sw_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __AES__
void sph_shavite512_aesni_init(void *cc);
void sph_shavite512_aesni(void *cc, const void *data, size_t len);
void sph_shavite512_aesni_close(void *cc, void *dst);
void sph_shavite512_aesni_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define sph_shavite512_init sph_shavite512_aesni_init
#define sph_shavite512 sph_shavite512_aesni
#define sph_shavite512_close sph_shavite512_aesni_close
#define sph_shavite512_addbits_and_close \
sph_shavite512_aesni_addbits_and_close
#else
#define sph_shavite512_init sph_shavite512_sw_init
#define sph_shavite512 sph_shavite512_sw
#define sph_shavite512_close sph_shavite512_sw_close
#define sph_shavite512_addbits_and_close \
sph_shavite512_sw_addbits_and_close
#endif
#ifdef __cplusplus
}
#endif
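With this dispatch the public sph_shavite512* names resolve to the AES-NI entry points when __AES__ is defined and to the software ones otherwise, so callers compile unchanged either way. A hypothetical caller, with data and data_len standing in for real input:

sph_shavite_big_context cc;
unsigned char digest[64];
sph_shavite512_init( &cc );             // -> _aesni_init or _sw_init
sph_shavite512( &cc, data, data_len );  // -> _aesni or _sw
sph_shavite512_close( &cc, digest );    // -> _aesni_close or _sw_close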


@@ -104,7 +104,7 @@ int scanhash_veltor(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
bool register_veltor_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT;
init_veltor_ctx();
gate->scanhash = (void*)&scanhash_veltor;
gate->hash = (void*)&veltorhash;


@@ -252,8 +252,8 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc,
current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
#endif
uint64_t *b= (uint64_t*)sc->buf;
uint64_t *s= (uint64_t*)sc->state;
//uint64_t *b= (uint64_t*)sc->buf;
//uint64_t *s= (uint64_t*)sc->state;
// printf("Sptr 1= %u\n",current);
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );


@@ -1,4 +1,7 @@
#include "whirlpool-gate.h"
#if defined(__AVX2__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -6,8 +9,6 @@
#include "sph_whirlpool.h"
#include "whirlpool-hash-4way.h"
#if defined(__AVX2__)
static __thread whirlpool_4way_context whirl_mid;
void whirlpool_hash_4way( void *state, const void *input )
@@ -50,7 +51,7 @@ void whirlpool_hash_4way( void *state, const void *input )
}
int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
unsigned long *hashes_done )
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -67,8 +68,8 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
// if (opt_benchmark)
// ((uint32_t*)ptarget)[7] = 0x0000ff;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);


@@ -2,14 +2,15 @@
bool register_whirlpool_algo( algo_gate_t* gate )
{
//#if defined (WHIRLPOOL_4WAY)
// gate->scanhash = (void*)&scanhash_whirlpool_4way;
// gate->hash = (void*)&whirlpool_hash_4way;
//#else
#if defined (WHIRLPOOL_4WAY)
four_way_not_tested();
gate->scanhash = (void*)&scanhash_whirlpool_4way;
gate->hash = (void*)&whirlpool_hash_4way;
#else
gate->scanhash = (void*)&scanhash_whirlpool;
gate->hash = (void*)&whirlpool_hash;
init_whirlpool_ctx();
//#endif
#endif
return true;
};


@@ -8,13 +8,13 @@
#define WHIRLPOOL_4WAY
#endif
//#if defined (WHIRLPOOL_4WAY)
#if defined (WHIRLPOOL_4WAY)
//void whirlpool_hash_4way(void *state, const void *input);
void whirlpool_hash_4way(void *state, const void *input);
//int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
//#endif
int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#else
void whirlpool_hash( void *state, const void *input );
@@ -22,3 +22,4 @@ int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
#endif


@@ -3365,6 +3365,8 @@ do { \
// scalar array of constants "table" and return referenced 64 bit entry.
#define t_lane( table, inv, row, lane ) \
table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ]
// table[ t_row( inv, row )[ lane ] ];
// Build a vector from elements of non-contiguous 64 bit data extracted from
// scalar "table".
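A sketch of the vector build that comment describes, using t_lane as defined above (four scalar extracts feeding one set; the lane argument must be a compile-time constant for _mm256_extract_epi64):

__m256i gathered = _mm256_set_epi64x( t_lane( table, inv, row, 3 ),
                                      t_lane( table, inv, row, 2 ),
                                      t_lane( table, inv, row, 1 ),
                                      t_lane( table, inv, row, 0 ) );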

algo/x11/x11-gate.c (new file, 18 lines)

@@ -0,0 +1,18 @@
#include "x11-gate.h"
bool register_x11_algo( algo_gate_t* gate )
{
#if defined (X11_4WAY)
init_x11_4way_ctx();
gate->scanhash = (void*)&scanhash_x11_4way;
gate->hash = (void*)&x11_hash_4way;
#else
init_x11_ctx();
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x11/x11-gate.h (new file, 30 lines)

@@ -0,0 +1,30 @@
#ifndef X11_GATE_H__
#define X11_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
//#if defined(HASH_4WAY) && !defined(NO_AES_NI)
// #define X11_4WAY
//#endif
bool register_x11_algo( algo_gate_t* gate );
#if defined(X11_4WAY)
void x11_hash_4way( void *state, const void *input );
int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void x11_hash( void *state, const void *input );
int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x11_ctx();
#endif


@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "x11-gate.h"
#include <string.h>
#include <stdint.h>
@@ -61,7 +61,7 @@ void init_x11_ctx()
#endif
}
static void x11_hash( void *state, const void *input )
void x11_hash( void *state, const void *input )
{
unsigned char hash[128] __attribute__ ((aligned (32)));
unsigned char hashbuf[128] __attribute__ ((aligned (16)));
@@ -189,7 +189,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
/*
bool register_x11_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
@@ -199,4 +199,4 @@ bool register_x11_algo( algo_gate_t* gate )
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
*/


@@ -440,7 +440,7 @@ bool register_yescrypt_algo( algo_gate_t* gate )
bool register_yescryptr16_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AVX_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;