v3.11.3

2026-07-15 03:16:49 +00:00 · 2020-01-10 20:37:47 -05:00
parent 70089d1224
commit f990f6a702
64 changed files with 1475 additions and 1801 deletions
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -7,33 +7,44 @@
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/groestl/aes_ni/hash-groestl256.h"
+#if defined(__VAES__)
+  #include "algo/groestl/groestl256-hash-4way.h"
+#endif

-#if defined (ALLIUM_8WAY)  
+#if defined (ALLIUM_16WAY)  

 typedef struct {
-   blake256_8way_context     blake;
+   blake256_16way_context     blake;
   keccak256_8way_context    keccak;
   cube_4way_context          cube;
   skein256_8way_context     skein;
+#if defined(__VAES__)
+   groestl256_4way_context groestl;
+#else
   hashState_groestl256      groestl;
-} allium_8way_ctx_holder;
+#endif
+} allium_16way_ctx_holder;

-static __thread allium_8way_ctx_holder allium_8way_ctx;
+static __thread allium_16way_ctx_holder allium_16way_ctx;

-bool init_allium_8way_ctx()
+bool init_allium_16way_ctx()
 {
-   keccak256_8way_init( &allium_8way_ctx.keccak );
-   cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
-   skein256_8way_init( &allium_8way_ctx.skein );
-   init_groestl256( &allium_8way_ctx.groestl, 32 );
+   keccak256_8way_init( &allium_16way_ctx.keccak );
+   cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
+   skein256_8way_init( &allium_16way_ctx.skein );
+#if defined(__VAES__)
+   groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
+#else
+   init_groestl256( &allium_16way_ctx.groestl, 32 );
+#endif
   return true;
 }

-void allium_8way_hash( void *state, const void *input )
+void allium_16way_hash( void *state, const void *input )
 {
-   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
-   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
-   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
+   uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
+   uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (64)));
   uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -42,18 +53,39 @@ void allium_8way_hash( void *state, const void *input )
   uint32_t hash5[8] __attribute__ ((aligned (64)));
   uint32_t hash6[8] __attribute__ ((aligned (64)));
   uint32_t hash7[8] __attribute__ ((aligned (64)));
-   allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+   uint32_t hash8[8] __attribute__ ((aligned (64)));
+   uint32_t hash9[8] __attribute__ ((aligned (64)));
+   uint32_t hash10[8] __attribute__ ((aligned (64)));
+   uint32_t hash11[8] __attribute__ ((aligned (64)));
+   uint32_t hash12[8] __attribute__ ((aligned (64)));
+   uint32_t hash13[8] __attribute__ ((aligned (64)));
+   uint32_t hash14[8] __attribute__ ((aligned (64)));
+   uint32_t hash15[8] __attribute__ ((aligned (64)));
+   allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));

-   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
-   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
-   blake256_8way_close( &ctx.blake, vhash );
+   memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
+   blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
+   blake256_16way_close( &ctx.blake, vhash );

-   rintrlv_8x32_8x64( vhashA, vhash, 256 );
+   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                  hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
+                  vhash, 256 );
+   intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                256 );
+   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
+                hash15, 256 );
+   
+//   rintrlv_8x32_8x64( vhashA, vhash, 256 );
   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
-   keccak256_8way_close( &ctx.keccak, vhash );
+   keccak256_8way_close( &ctx.keccak, vhashA);
+   keccak256_8way_init( &ctx.keccak );
+   keccak256_8way_update( &ctx.keccak, vhashB, 32 );
+   keccak256_8way_close( &ctx.keccak, vhashB);

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 vhash, 256 );
+                 vhashA, 256 );
+   dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
+                 vhashB, 256 );

   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
@@ -67,6 +99,18 @@ void allium_8way_hash( void *state, const void *input )
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );
  
   intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
@@ -78,6 +122,17 @@ void allium_8way_hash( void *state, const void *input )
   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );

+   intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
+   intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
+
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+
+   dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
+   dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
+
   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
@@ -90,15 +145,258 @@ void allium_8way_hash( void *state, const void *input )
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );

-   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+   intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                hash7, 256 );
+   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
+                hash15, 256 );
+
+   skein256_8way_update( &ctx.skein, vhashA, 32 );
+   skein256_8way_close( &ctx.skein, vhashA );
+   skein256_8way_init( &ctx.skein );
+   skein256_8way_update( &ctx.skein, vhashB, 32 );
+   skein256_8way_close( &ctx.skein, vhashB );

-   skein256_8way_update( &ctx.skein, vhash, 32 );
-   skein256_8way_close( &ctx.skein, vhash );

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 vhash, 256 );
+                 vhashA, 256 );
+   dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
+                 vhashB, 256 );
+
+#if defined(__VAES__)
+
+   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
+
+   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+
+   dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 );
+   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
+
+   groestl256_4way_init( &ctx.groestl, 32 );
+   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+   
+   dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 );
+   intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
+
+   groestl256_4way_init( &ctx.groestl, 32 );
+   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+
+   dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 );
+   intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
+
+   groestl256_4way_init( &ctx.groestl, 32 );
+   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+ 
+   dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 );
+   
+#else
+
+   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+256, hash8, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+288, hash9, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+320, hash10, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+352, hash11, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+384, hash12, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+416, hash13, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+448, hash14, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+480, hash15, 256 );
+
+#endif
+}
+
+int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));   // 8 words x 16 lanes
+   uint32_t vdata[20*16] __attribute__ ((aligned (64)));  // 20 words x 16 lanes
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 16;   // one 16-lane pass short
+   const uint32_t Htarg = ptarget[7];
+   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   // Scans nonces 16 per pass starting at pdata[19]; always returns 0.
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   // Absorb the first 64 header bytes once; each pass hashes the last 16.
+   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   blake256_16way_init( &allium_16way_ctx.blake );
+   blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
+   // A pass covers n..n+15, so stop 16 (not 8) below max_nonce.
+   do {
+     *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
+                                                 n+11, n+10, n+ 9, n+ 8,
+                                                 n+ 7, n+ 6, n+ 5, n+ 4,
+                                                 n+ 3, n+ 2, n +1, n ) );
+
+     allium_16way_hash( hash, vdata );
+     pdata[19] = n;
+     // Check word 7 of each lane's hash before calling fulltest().
+     for ( int lane = 0; lane < 16; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
+     {
+        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n + lane;
+           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
+         }
+     }
+     n += 16;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (ALLIUM_8WAY)  
+
+typedef struct {   // hash contexts used by the 8-way Allium path
+   blake256_8way_context     blake;    // one context covering all 8 lanes
+   keccak256_4way_context    keccak;   // 4 lanes; re-initialized for lanes 4-7
+   cubehashParam             cube;     // re-initialized between per-hash calls
+   skein256_4way_context     skein;    // 4 lanes; re-initialized for lanes 4-7
+   hashState_groestl256      groestl;  // re-copied from the template per hash
+
+} allium_8way_ctx_holder;
+
+static __thread allium_8way_ctx_holder allium_8way_ctx;
+
+bool init_allium_8way_ctx()   // fills the thread-local template context
+{
+   keccak256_4way_init( &allium_8way_ctx.keccak );
+   cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
+   skein256_4way_init( &allium_8way_ctx.skein );
+   init_groestl256( &allium_8way_ctx.groestl, 32 );
+   return true;   // always true; the blake member is not initialized here
+}
+
+void allium_8way_hash( void *state, const void *input )
+{
+   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
+   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (32)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t hash4[8] __attribute__ ((aligned (64)));
+   uint32_t hash5[8] __attribute__ ((aligned (32)));
+   uint32_t hash6[8] __attribute__ ((aligned (32)));
+   uint32_t hash7[8] __attribute__ ((aligned (32)));
+   allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); 
+
+   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
+   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
+   blake256_8way_close( &ctx.blake, vhashA );
+
+   dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                     vhashA, 256 );
+   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
+
+   keccak256_4way_update( &ctx.keccak, vhashA, 32 );
+   keccak256_4way_close( &ctx.keccak, vhashA );
+   keccak256_4way_init( &ctx.keccak );
+   keccak256_4way_update( &ctx.keccak, vhashB, 32 );
+   keccak256_4way_close( &ctx.keccak, vhashB );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
+
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
+   LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
+   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
+   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
+
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );
+
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
+   LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
+   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
+   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
+
+   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
+
+   skein256_4way_update( &ctx.skein, vhashA, 32 );
+   skein256_4way_close( &ctx.skein, vhashA );
+   skein256_4way_init( &ctx.skein );
+   skein256_4way_update( &ctx.skein, vhashB, 32 );
+   skein256_4way_close( &ctx.skein, vhashB );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );

   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
@@ -122,23 +420,21 @@ void allium_8way_hash( void *state, const void *input )
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
 }

 int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*8] __attribute__ ((aligned (128)));
+   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
-   uint32_t n = first_nonce;
   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -169,126 +465,4 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
   return 0;
 }

-
-#elif defined (ALLIUM_4WAY)  
-
-
-typedef struct {
-   blake256_4way_context     blake;
-   keccak256_4way_context    keccak;
-   cubehashParam             cube;
-   skein256_4way_context     skein;
-   hashState_groestl256      groestl;
-
-} allium_4way_ctx_holder;
-
-static __thread allium_4way_ctx_holder allium_4way_ctx;
-
-bool init_allium_4way_ctx()
-{
-   keccak256_4way_init( &allium_4way_ctx.keccak );
-   cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
-   skein256_4way_init( &allium_4way_ctx.skein );
-   init_groestl256( &allium_4way_ctx.groestl, 32 );
-   return true;
-}
-
-void allium_4way_hash( void *state, const void *input )
-{
-   uint32_t hash0[8] __attribute__ ((aligned (64)));
-   uint32_t hash1[8] __attribute__ ((aligned (32)));
-   uint32_t hash2[8] __attribute__ ((aligned (32)));
-   uint32_t hash3[8] __attribute__ ((aligned (32)));
-   uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
-   uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
-   allium_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
-
-   memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
-   blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
-   blake256_4way_close( &ctx.blake, vhash32 );
-
-   rintrlv_4x32_4x64( vhash64, vhash32, 256 );
-   keccak256_4way_update( &ctx.keccak, vhash64, 32 );
-   keccak256_4way_close( &ctx.keccak, vhash64 );
-
-   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
-
-   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
-   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
-   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
-   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
-
-   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
-
-   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
-   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
-   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
-   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
-
-   intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
-
-   skein256_4way_update( &ctx.skein, vhash64, 32 );
-   skein256_4way_close( &ctx.skein, vhash64 );
-
-   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
-
-   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
-}
-
-int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
-                             uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t hash[8*4] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   uint32_t n = first_nonce;
-   const uint32_t Htarg = ptarget[7];
-   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-   if ( opt_benchmark )
-      ( (uint32_t*)ptarget )[7] = 0x0000ff;
-
-   mm128_bswap32_intrlv80_4x32( vdata, pdata );
-   blake256_4way_init( &allium_4way_ctx.blake );
-   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
-
-   do {
-     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
-
-     allium_4way_hash( hash, vdata );
-     pdata[19] = n;
-
-     for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
-     {
-        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
-        {
-           pdata[19] = n + lane;
-           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
-         }
-     }
-     n += 4;
-   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
-
-   *hashes_done = n - first_nonce + 1;
-   return 0;
-}
-
 #endif
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -78,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev3;
  gate->hash      = (void*)&lyra2rev3_hash;
 #endif
-  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
  opt_target_factor = 256.0;
  return true;
@@ -119,7 +119,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  opt_target_factor = 256.0;
  return true;
@@ -146,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2z;
  gate->hash       = (void*)&lyra2z_hash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
  return true;
 };
@@ -165,7 +165,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2h;
  gate->hash       = (void*)&lyra2h_hash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
  opt_target_factor = 256.0;
  return true;
 };
@@ -174,20 +174,20 @@ bool register_lyra2h_algo( algo_gate_t* gate )

 bool register_allium_algo( algo_gate_t* gate )
 {
-#if defined (ALLIUM_8WAY)
+#if defined (ALLIUM_16WAY)
+  gate->miner_thread_init = (void*)&init_allium_16way_ctx;
+  gate->scanhash  = (void*)&scanhash_allium_16way;
+  gate->hash      = (void*)&allium_16way_hash;
+#elif defined (ALLIUM_8WAY)
  gate->miner_thread_init = (void*)&init_allium_8way_ctx;
  gate->scanhash  = (void*)&scanhash_allium_8way;
  gate->hash      = (void*)&allium_8way_hash;
-#elif defined (ALLIUM_4WAY)
-  gate->miner_thread_init = (void*)&init_allium_4way_ctx;
-  gate->scanhash  = (void*)&scanhash_allium_4way;
-  gate->hash      = (void*)&allium_4way_hash;
 #else
  gate->miner_thread_init = (void*)&init_allium_ctx;
  gate->scanhash  = (void*)&scanhash_allium;
  gate->hash      = (void*)&allium_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  opt_target_factor = 256.0;
  return true;
 };
@@ -229,7 +229,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 bool register_phi2_algo( algo_gate_t* gate )
 {
 //   init_phi2_ctx();
-   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
   gate->get_work_data_size = (void*)&phi2_get_work_data_size;
   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
   gate->build_extraheader  = (void*)&phi2_build_extraheader;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -153,27 +153,27 @@ bool lyra2h_thread_init();
 //////////////////////////////////

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define ALLIUM_8WAY 1
+  #define ALLIUM_16WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
-  #define ALLIUM_4WAY 1
+  #define ALLIUM_8WAY 1
 #endif

 bool register_allium_algo( algo_gate_t* gate );

-#if defined(ALLIUM_8WAY)
+#if defined(ALLIUM_16WAY)
+
+void allium_16way_hash( void *state, const void *input );
+int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+bool init_allium_16way_ctx();
+
+#elif defined(ALLIUM_8WAY)

 void allium_8way_hash( void *state, const void *input );
 int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool init_allium_8way_ctx();

-#elif defined(ALLIUM_4WAY)
-
-void allium_4way_hash( void *state, const void *input );
-int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done, struct thr_info *mythr );
-bool init_allium_4way_ctx();
-
 #else

 void allium_hash( void *state, const void *input );
--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -575,4 +575,138 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
   return 0;
 }

+int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd,
+                  const uint64_t pwdlen, const uint64_t timeCost,
+                  const uint64_t nRows, const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[32];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa0 = 0;
+   int64_t rowa1 = 0;
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //auxiliary iteration counter
+   //====================================================================/
+
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //Tries to allocate enough space for the whole memory matrix
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+
+   i = (int64_t)ROW_LEN_BYTES * nRows;
+   uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );   // twice the one-lane size
+   if (wholeMatrix == NULL)
+      return -1;   // allocation failed
+
+   memset_zero_512( (__m512i*)wholeMatrix, i>>5 );
+
+   uint64_t *ptrWord = wholeMatrix;
+   uint64_t *pw = (uint64_t*)pwd;
+
+   //First, we clean enough blocks for the password, salt, basil and padding
+   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   uint64_t *ptr = wholeMatrix;
+
+   memcpy( ptr, pw, 2*pwdlen ); // password (2*pwdlen bytes of the 2-way input)
+   ptr += pwdlen>>2;
+   memcpy( ptr, pw, 2*pwdlen ); // same buffer again; presumably the salt (saltlen = pwdlen below)
+   ptr += pwdlen>>2;
+
+   // now build the rest interleaving on the fly.
+
+   ptr[0] = ptr[ 4] = kLen;
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;
+   ptr[11] = ptr[15] = 0x0100000000000000;
+
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                                  &wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
+   //===================== Setup Phase =================================//
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
+                                         nCols );
+
+      //updates the value of row* (deterministically picked during Setup)
+      rowa0 = (rowa0 + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa0 == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
+      do
+      {
+        rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
+        rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
+
+        reducedDuplexRow_2way_X( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+                                      nCols );
+
+           //update prev: it now points to the last row ever computed
+           prev = row;
+
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------
+
+       } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the rows last selected by rowa0 and rowa1
+   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
+                            &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
+   //Squeezes kLen bytes of key into K
+   squeeze_2way( state, K, (unsigned int) kLen );
+
+   //================== Freeing the memory =============================//
+   _mm_free(wholeMatrix);
+
+   return 0;   // success
+}
+      
 #endif
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -74,6 +74,9 @@ int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
 int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
          uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );

+int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+                  uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
 #endif

 #endif /* LYRA2_H_ */
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -246,15 +246,32 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

-inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
-                            uint64_t *rowInOut0, uint64_t *rowInOut1,
+// Reduced duplex row has three versions depending on the inout rows.
+// If they are the same, the fastest version can be used, equivalent to
+// the linear version.
+// If either rowInOut overlaps with rowOut, the slowest version is used,
+// to refresh local data after overwriting rowOut.
+// Otherwise the normal version is used, slower than unified, faster than
+// overlap.
+//
+// The likelihood of each case depends on the number of rows. More rows
+// means unified and overlap are both less likely.
+// Unified has a 1 in Nrows chance.
+// Overlap has a 2 in Nrows chance, reduced to 1 in Nrows because if both
+// overlap it's unified.
+// As a result normal is (Nrows-2) / Nrows.
+// For 4 rows: 1 unified, 1 overlap, 2 normal.
+// For 8 rows: 1 unified, 1 overlap, 6 normal.
+
+static inline void reducedDuplexRow_2way_normal( uint64_t *State,
+                   uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
                            uint64_t *rowOut, uint64_t nCols)
 {
   int i;
   register __m512i state0, state1, state2, state3;
   __m512i *in = (__m512i*)rowIn;
-   __m256i *inout0 = (__m256i*)rowInOut0;
-   __m256i *inout1 = (__m256i*)rowInOut1;
+   __m512i *inout0 = (__m512i*)rowInOut0;
+   __m512i *inout1 = (__m512i*)rowInOut1;
   __m512i *out = (__m512i*)rowOut;
   register __m512i io0, io1, io2;

@@ -262,19 +279,19 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
   state1 = _mm512_load_si512( (__m512i*)State + 1 );
   state2 = _mm512_load_si512( (__m512i*)State + 2 );
   state3 = _mm512_load_si512( (__m512i*)State + 3 );
-    
+
   for ( i = 0; i < nCols; i++ )
   {
     //Absorbing "M[prev] [+] M[row*]"
     io0 = _mm512_mask_blend_epi64( 0xf0,
-                 _mm512_load_si512( (__m512i*)inout0 ),
-                 _mm512_load_si512( (__m512i*)inout1 ) );
+                                    _mm512_load_si512( (__m512i*)inout0 ),
+                                    _mm512_load_si512( (__m512i*)inout1 ) );
     io1 = _mm512_mask_blend_epi64( 0xf0,
-                 _mm512_load_si512( (__m512i*)inout0 +1 ),
-                 _mm512_load_si512( (__m512i*)inout1 +1 ) );
+                                    _mm512_load_si512( (__m512i*)inout0 +1 ),
+                                    _mm512_load_si512( (__m512i*)inout1 +1 ) );
     io2 = _mm512_mask_blend_epi64( 0xf0,
-                 _mm512_load_si512( (__m512i*)inout0 +2 ),
-                 _mm512_load_si512( (__m512i*)inout1 +2 ) );
+                                    _mm512_load_si512( (__m512i*)inout0 +2 ),
+                                    _mm512_load_si512( (__m512i*)inout1 +2 ) );

     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
@@ -286,29 +303,6 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
     {
       register __m512i t0, t1, t2;

-       //M[rowOut][col] = M[rowOut][col] XOR rand
-       t0 = _mm512_xor_si512( out[0], state0 );
-       t1 = _mm512_xor_si512( out[1], state1 );
-       t2 = _mm512_xor_si512( out[2], state2 );
-
-       // if out is the same row as inout, update with new data.
-       if ( rowOut == rowInOut0 )
-       {
-          io0 = _mm512_mask_blend_epi64( 0x0f, io0, t0 );
-          io1 = _mm512_mask_blend_epi64( 0x0f, io1, t1 );
-          io2 = _mm512_mask_blend_epi64( 0x0f, io2, t2 );
-       }
-       if ( rowOut == rowInOut1 )
-       {
-          io0 = _mm512_mask_blend_epi64( 0xf0, io0, t0 );
-          io1 = _mm512_mask_blend_epi64( 0xf0, io1, t1 );
-          io2 = _mm512_mask_blend_epi64( 0xf0, io2, t2 );
-       }
-
-       out[0] = t0;
-       out[1] = t1;
-       out[2] = t2;
-
       //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
       t0 = _mm512_permutex_epi64( state0, 0x93 );
       t1 = _mm512_permutex_epi64( state1, 0x93 );
@@ -317,19 +311,24 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
       io0 = _mm512_xor_si512( io0, _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
       io1 = _mm512_xor_si512( io1, _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
       io2 = _mm512_xor_si512( io2, _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
+
+       //M[rowOut][col] = M[rowOut][col] XOR rand
+       out[0] = _mm512_xor_si512( out[0], state0 );
+       out[1] = _mm512_xor_si512( out[1], state1 );
+       out[2] = _mm512_xor_si512( out[2], state2 );
     }

-     _mm512_mask_store_epi64( (__m512i*)inout0,    0x0f, io0 );
-     _mm512_mask_store_epi64( (__m512i*)inout1,    0xf0, io0 );
-     _mm512_mask_store_epi64( (__m512i*)inout0 +1, 0x0f, io1 );
-     _mm512_mask_store_epi64( (__m512i*)inout1 +1, 0xf0, io1 );
-     _mm512_mask_store_epi64( (__m512i*)inout0 +2, 0x0f, io2 );
-     _mm512_mask_store_epi64( (__m512i*)inout1 +2, 0xf0, io2 );
+     _mm512_mask_store_epi64( inout0,    0x0f, io0 );
+     _mm512_mask_store_epi64( inout1,    0xf0, io0 );
+     _mm512_mask_store_epi64( inout0 +1, 0x0f, io1 );
+     _mm512_mask_store_epi64( inout1 +1, 0xf0, io1 );
+     _mm512_mask_store_epi64( inout0 +2, 0x0f, io2 );
+     _mm512_mask_store_epi64( inout1 +2, 0xf0, io2 );

      //Goes to next block
      in     += BLOCK_LEN_M256I;
-      inout0 += BLOCK_LEN_M256I * 2;
-      inout1 += BLOCK_LEN_M256I * 2;
+      inout0 += BLOCK_LEN_M256I;
+      inout1 += BLOCK_LEN_M256I;
      out    += BLOCK_LEN_M256I;
   }

@@ -339,4 +338,297 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
   _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

+
+
+// rowInOut0 != rowInOut1, and one of them == rowOut: slowest, unlikely.
+static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
+                   uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
+                            uint64_t *rowOut, uint64_t nCols)
+{
+
+   int i;
+   register __m512i state0, state1, state2, state3;
+   __m512i *in = (__m512i*)rowIn;
+   __m512i *inout0 = (__m512i*)rowInOut0;
+   __m512i *inout1 = (__m512i*)rowInOut1;
+   __m512i *out = (__m512i*)rowOut;
+   inout_ovly io;
+
+   state0 = _mm512_load_si512( (__m512i*)State     );
+   state1 = _mm512_load_si512( (__m512i*)State + 1 );
+   state2 = _mm512_load_si512( (__m512i*)State + 2 );
+   state3 = _mm512_load_si512( (__m512i*)State + 3 );
+    
+   for ( i = 0; i < nCols; i++ )
+   {
+     //Absorbing "M[prev] [+] M[row*]"
+     io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
+                                  _mm512_load_si512( (__m512i*)inout0 ),
+                                  _mm512_load_si512( (__m512i*)inout1 ) );
+     io.v512[1] = _mm512_mask_blend_epi64( 0xf0,
+                                  _mm512_load_si512( (__m512i*)inout0 +1 ),
+                                  _mm512_load_si512( (__m512i*)inout1 +1 ) );
+     io.v512[2] = _mm512_mask_blend_epi64( 0xf0,
+                                  _mm512_load_si512( (__m512i*)inout0 +2 ),
+                                  _mm512_load_si512( (__m512i*)inout1 +2 ) );
+
+     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
+     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
+     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
+
+     //Applies the reduced-round transformation f to the sponge's state
+     LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+     {
+       __m512i t0, t1, t2;
+
+       //M[rowOut][col] = M[rowOut][col] XOR rand
+       out[0] = _mm512_xor_si512( out[0], state0 );
+       out[1] = _mm512_xor_si512( out[1], state1 );
+       out[2] = _mm512_xor_si512( out[2], state2 );
+
+       // if out is the same row as inout, update with new data.
+       if ( rowOut == rowInOut0 )
+       {
+          io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
+          io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] );
+          io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] );
+
+       }
+       if ( rowOut == rowInOut1 )
+       {
+          io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] );
+          io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
+          io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
+       }
+
+       //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+       t0 = _mm512_permutex_epi64( state0, 0x93 );
+       t1 = _mm512_permutex_epi64( state1, 0x93 );
+       t2 = _mm512_permutex_epi64( state2, 0x93 );
+
+       io.v512[0] = _mm512_xor_si512( io.v512[0],
+                                 _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
+       io.v512[1] = _mm512_xor_si512( io.v512[1],
+                                 _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
+       io.v512[2] = _mm512_xor_si512( io.v512[2],
+                                 _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
+     }
+
+     _mm512_mask_store_epi64( inout0,    0x0f, io.v512[0] );
+     _mm512_mask_store_epi64( inout1,    0xf0, io.v512[0] );
+     _mm512_mask_store_epi64( inout0 +1, 0x0f, io.v512[1] );
+     _mm512_mask_store_epi64( inout1 +1, 0xf0, io.v512[1] );
+     _mm512_mask_store_epi64( inout0 +2, 0x0f, io.v512[2] );
+     _mm512_mask_store_epi64( inout1 +2, 0xf0, io.v512[2] );
+
+      //Goes to next block
+      in     += BLOCK_LEN_M256I;
+      inout0 += BLOCK_LEN_M256I;
+      inout1 += BLOCK_LEN_M256I;
+      out    += BLOCK_LEN_M256I;
+   }
+
+   _mm512_store_si512( (__m512i*)State,     state0 );
+   _mm512_store_si512( (__m512i*)State + 1, state1 );
+   _mm512_store_si512( (__m512i*)State + 2, state2 );
+   _mm512_store_si512( (__m512i*)State + 3, state3 );
+
+}
+
+static inline void reducedDuplexRow_2way_overlap_X( uint64_t *State,
+                    uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
+                    uint64_t *rowOut, uint64_t nCols)
+{
+   int i;
+   register __m512i state0, state1, state2, state3;
+   __m512i *in = (__m512i*)rowIn;
+   __m256i *inout0 = (__m256i*)rowInOut0;
+   __m256i *inout1 = (__m256i*)rowInOut1;
+   __m512i *out = (__m512i*)rowOut;
+   inout_ovly inout;
+   __m512i t0, t1, t2;
+
+   state0 = _mm512_load_si512( (__m512i*)State     );
+   state1 = _mm512_load_si512( (__m512i*)State + 1 );
+   state2 = _mm512_load_si512( (__m512i*)State + 2 );
+   state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+    for ( i = 0; i < nCols; i++ )
+    {
+
+      //Absorbing "M[prev] [+] M[row*]"
+      inout.v256[0] = inout0[0];
+      inout.v256[1] = inout1[1];
+      inout.v256[2] = inout0[2];
+      inout.v256[3] = inout1[3];
+      inout.v256[4] = inout0[4];
+      inout.v256[5] = inout1[5];
+
+      state0 = _mm512_xor_si512( state0,
+                                 _mm512_add_epi64( in[0], inout.v512[0] ) );
+      state1 = _mm512_xor_si512( state1,
+                                 _mm512_add_epi64( in[1], inout.v512[1] ) );
+      state2 = _mm512_xor_si512( state2,
+                                 _mm512_add_epi64( in[2], inout.v512[2] ) );
+
+
+      //Applies the reduced-round transformation f to the sponge's state
+      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+      //M[rowOut][col] = M[rowOut][col] XOR rand
+      out[0] = _mm512_xor_si512( out[0], state0 );
+      out[1] = _mm512_xor_si512( out[1], state1 );
+      out[2] = _mm512_xor_si512( out[2], state2 );
+
+      // if inout is the same row as out it was just overwritten, reload.
+      if ( rowOut == rowInOut0 )
+      {
+         inout.v256[0] = ( (__m256i*)out )[0];
+         inout.v256[2] = ( (__m256i*)out )[2];
+         inout.v256[4] = ( (__m256i*)out )[4];
+      }
+      if ( rowOut == rowInOut1 )
+      {
+         inout.v256[1] = ( (__m256i*)out )[1];
+         inout.v256[3] = ( (__m256i*)out )[3];
+         inout.v256[5] = ( (__m256i*)out )[5];
+      }
+
+      //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+      t0 = _mm512_permutex_epi64( state0, 0x93 );
+      t1 = _mm512_permutex_epi64( state1, 0x93 );
+      t2 = _mm512_permutex_epi64( state2, 0x93 );
+
+      inout.v512[0] = _mm512_xor_si512( inout.v512[0],
+                                   _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
+      inout.v512[1] = _mm512_xor_si512( inout.v512[1],
+                                   _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
+      inout.v512[2] = _mm512_xor_si512( inout.v512[2],
+                                   _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
+
+      inout0[0] = inout.v256[0];
+      inout1[1] = inout.v256[1];
+      inout0[2] = inout.v256[2];
+      inout1[3] = inout.v256[3];
+      inout0[4] = inout.v256[4];
+      inout1[5] = inout.v256[5];
+
+       //Goes to next block
+       in     += BLOCK_LEN_M256I;
+       inout0 += BLOCK_LEN_M256I * 2;
+       inout1 += BLOCK_LEN_M256I * 2;
+       out    += BLOCK_LEN_M256I;
+   }
+
+   _mm512_store_si512( (__m512i*)State,     state0 );
+   _mm512_store_si512( (__m512i*)State + 1, state1 );
+   _mm512_store_si512( (__m512i*)State + 2, state2 );
+   _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// rowInOut0 == rowInOut1, fastest, least likely: 1 / nrows
+static inline void reducedDuplexRow_2way_unified( uint64_t *State,
+                   uint64_t *rowIn, uint64_t *rowInOut0,
+                            uint64_t *rowOut, uint64_t nCols)
+{
+   int i;
+   register __m512i state0, state1, state2, state3;
+   __m512i *in = (__m512i*)rowIn;
+   __m512i *inout = (__m512i*)rowInOut0;
+   __m512i *out = (__m512i*)rowOut;
+
+   state0 = _mm512_load_si512( (__m512i*)State     );
+   state1 = _mm512_load_si512( (__m512i*)State + 1 );
+   state2 = _mm512_load_si512( (__m512i*)State + 2 );
+   state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+   for ( i = 0; i < nCols; i++ )
+   {
+     //Absorbing "M[prev] [+] M[row*]"
+     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], inout[0] ) );
+     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], inout[1] ) );
+     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], inout[2] ) );
+
+     //Applies the reduced-round transformation f to the sponge's state
+     LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+     {
+       register __m512i t0, t1, t2;
+
+       //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+       t0 = _mm512_permutex_epi64( state0, 0x93 );
+       t1 = _mm512_permutex_epi64( state1, 0x93 );
+       t2 = _mm512_permutex_epi64( state2, 0x93 );
+
+       inout[0] = _mm512_xor_si512( inout[0],
+                                    _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
+       inout[1] = _mm512_xor_si512( inout[1],
+                                    _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
+       inout[2] = _mm512_xor_si512( inout[2],
+                                     _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
+
+       out[0] = _mm512_xor_si512( out[0], state0 );
+       out[1] = _mm512_xor_si512( out[1], state1 );
+       out[2] = _mm512_xor_si512( out[2], state2 );
+
+     }
+
+     //Goes to next block
+     in     += BLOCK_LEN_M256I;
+     inout += BLOCK_LEN_M256I;
+     out    += BLOCK_LEN_M256I;
+   }
+
+   _mm512_store_si512( (__m512i*)State,     state0 );
+   _mm512_store_si512( (__m512i*)State + 1, state1 );
+   _mm512_store_si512( (__m512i*)State + 2, state2 );
+   _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// Multi level specialization.
+// There are three cases that need to be handled:
+// unified: inout data is contiguous, fastest, unlikely.
+// normal: inout data is not contiguous with no overlap with out, likely.
+// overlap: inout data is not contiguous and one lane overlaps with out,
+//          slowest, unlikely.
+//
+// In addition different algos prefer different coding. x25x and x22i prefer
+// 256 bit memory accesses to handle the diverged data while all other
+// algos prefer 512 bit memory accesses with masking and blending.
+
+ 
+//  Wrapper: selects the unified, overlap or normal variant by comparing rows.
+inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
+                            uint64_t *rowInOut0, uint64_t *rowInOut1,
+                            uint64_t *rowOut, uint64_t nCols )
+{
+  if ( rowInOut0 == rowInOut1 )   // both inout pointers name the same row
+     reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
+  else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) ) // aliases out
+     reducedDuplexRow_2way_overlap( State, rowIn, rowInOut0, rowInOut1,
+                                    rowOut, nCols );
+  else   // inout rows differ and neither is rowOut
+     reducedDuplexRow_2way_normal( State, rowIn, rowInOut0, rowInOut1,
+                                   rowOut, nCols );
+}
+
+inline void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn,
+                            uint64_t *rowInOut0, uint64_t *rowInOut1,
+                            uint64_t *rowOut, uint64_t nCols )
+{
+    if ( rowInOut0 == rowInOut1 )   // both inout pointers name the same row
+      reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
+    else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) ) // aliases out
+    {
+       asm ( "nop" );  // Keeps GCC from merging this with reducedDuplexRow_2way
+       reducedDuplexRow_2way_overlap_X( State, rowIn, rowInOut0, rowInOut1,
+                                      rowOut, nCols );
+    }
+    else   // inout rows differ and neither is rowOut
+      reducedDuplexRow_2way_normal( State, rowIn, rowInOut0, rowInOut1,
+                                    rowOut, nCols );
+}
+
+
 #endif // AVX512
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -203,13 +203,12 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-union _povly
+union _inout_ovly
 {
-   __m512i *v512;
-   __m256i *v256;
-   uint64_t *u64;
+   __m512i v512[3];
+   __m256i v256[6];
 };
-typedef union _povly povly;
+typedef union _inout_ovly inout_ovly;

 //---- Housekeeping
 void initState_2way( uint64_t State[/*16*/] );
@@ -234,6 +233,10 @@ void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
                            uint64_t *rowInOut0, uint64_t *rowInOut1,
                            uint64_t *rowOut, uint64_t nCols);

+void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn,
+                              uint64_t *rowInOut0, uint64_t *rowInOut1,
+                              uint64_t *rowOut, uint64_t nCols);
+
 #endif