commit 66191db93c
parent dd99580a4c
Author: Jay D Dee
Date: 2025-06-20 20:31:41 -04:00
86 changed files with 2701 additions and 4322 deletions
86 changed files with 2701 additions and 4322 deletions
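The pattern across every file below: the SIMD hash contexts and helpers lose the ambiguous _Nway suffix in favour of _NxW, where N is the lane count and W the lane width in bits. 32-bit-word algorithms (blake256, bmw256) become _16x32/_8x32/_4x32; 64-bit-word algorithms (keccak256, skein256) become _8x64/_4x64. A minimal sketch of the scheme, using hypothetical stand-in types rather than the miner's real contexts:

/* Hypothetical stand-ins, not the real contexts: the old "_Nway" suffix
   gave only the lane count; the new "_NxW" suffix reads as N lanes of
   W-bit words, so 16x32 and 8x64 both fill one 512-bit vector. */
#include <stdint.h>

typedef struct { uint32_t lane[16]; } hash256_16x32_context; /* 16 x 32-bit */
typedef struct { uint64_t lane[8];  } hash256_8x64_context;  /*  8 x 64-bit */

/* old-style names kept here only for comparison */
typedef hash256_16x32_context hash256_16way_context;
typedef hash256_8x64_context  hash256_8way_context;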

View File

@@ -26,9 +26,9 @@
#if defined (ALLIUM_16WAY)
typedef union {
-keccak256_8way_context keccak;
+keccak256_8x64_context keccak;
cube_4way_2buf_context cube;
-skein256_8way_context skein;
+skein256_8x64_context skein;
#if defined(__VAES__)
groestl256_4way_context groestl;
#else
@@ -60,7 +60,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
uint32_t hash15[8] __attribute__ ((aligned (32)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
-blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
+blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -70,12 +70,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
-keccak256_8way_init( &ctx.keccak );
-keccak256_8way_update( &ctx.keccak, vhashA, 32 );
-keccak256_8way_close( &ctx.keccak, vhashA);
-keccak256_8way_init( &ctx.keccak );
-keccak256_8way_update( &ctx.keccak, vhashB, 32 );
-keccak256_8way_close( &ctx.keccak, vhashB);
+keccak256_8x64_init( &ctx.keccak );
+keccak256_8x64_update( &ctx.keccak, vhashA, 32 );
+keccak256_8x64_close( &ctx.keccak, vhashA);
+keccak256_8x64_init( &ctx.keccak );
+keccak256_8x64_update( &ctx.keccak, vhashB, 32 );
+keccak256_8x64_close( &ctx.keccak, vhashB);
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
@@ -153,12 +153,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
-skein256_8way_init( &ctx.skein );
-skein256_8way_update( &ctx.skein, vhashA, 32 );
-skein256_8way_close( &ctx.skein, vhashA );
-skein256_8way_init( &ctx.skein );
-skein256_8way_update( &ctx.skein, vhashB, 32 );
-skein256_8way_close( &ctx.skein, vhashB );
+skein256_8x64_init( &ctx.skein );
+skein256_8x64_update( &ctx.skein, vhashA, 32 );
+skein256_8x64_close( &ctx.skein, vhashA );
+skein256_8x64_init( &ctx.skein );
+skein256_8x64_update( &ctx.skein, vhashB, 32 );
+skein256_8x64_close( &ctx.skein, vhashB );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
@@ -251,7 +251,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
// Partially prehash second block without touching nonces in block_buf[3].
-blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -273,9 +273,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
#elif defined (ALLIUM_8WAY)
typedef union {
-keccak256_4way_context keccak;
+keccak256_4x64_context keccak;
cube_2way_context cube;
-skein256_4way_context skein;
+skein256_4x64_context skein;
#if defined(__VAES__)
groestl256_2way_context groestl;
#else
@@ -298,19 +298,19 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
-blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
+blake256_8x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
-keccak256_4way_init( &ctx.keccak );
-keccak256_4way_update( &ctx.keccak, vhashA, 32 );
-keccak256_4way_close( &ctx.keccak, vhashA );
-keccak256_4way_init( &ctx.keccak );
-keccak256_4way_update( &ctx.keccak, vhashB, 32 );
-keccak256_4way_close( &ctx.keccak, vhashB );
+keccak256_4x64_init( &ctx.keccak );
+keccak256_4x64_update( &ctx.keccak, vhashA, 32 );
+keccak256_4x64_close( &ctx.keccak, vhashA );
+keccak256_4x64_init( &ctx.keccak );
+keccak256_4x64_update( &ctx.keccak, vhashB, 32 );
+keccak256_4x64_close( &ctx.keccak, vhashB );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -350,12 +350,12 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
-skein256_4way_init( &ctx.skein );
-skein256_4way_update( &ctx.skein, vhashA, 32 );
-skein256_4way_close( &ctx.skein, vhashA );
-skein256_4way_init( &ctx.skein );
-skein256_4way_update( &ctx.skein, vhashB, 32 );
-skein256_4way_close( &ctx.skein, vhashB );
+skein256_4x64_init( &ctx.skein );
+skein256_4x64_update( &ctx.skein, vhashA, 32 );
+skein256_4x64_close( &ctx.skein, vhashA );
+skein256_4x64_init( &ctx.skein );
+skein256_4x64_update( &ctx.skein, vhashB, 32 );
+skein256_4x64_close( &ctx.skein, vhashB );
#if defined(__VAES__)
@@ -433,7 +433,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
n+ 3, n+ 2, n+ 1, n );
// Partially prehash second block without touching nonces
-blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -483,7 +483,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
uint64_t *hash3 = (uint64_t*)hash+12;
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
-blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
+blake256_4x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );
intrlv_2x64( vhashA, hash0, hash1, 256 );
@@ -588,7 +588,7 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
block_buf[15] = v128_32( 640 );
// Partially prehash second block without touching nonces
-blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -616,7 +616,6 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
//
// 1 way
typedef struct
{
blake256_context blake;
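The intrlv_8x64/dintrlv_16x32 pairs above regroup the same 16 hashes between word widths: blake256 runs on 16 interleaved 32-bit lanes, while keccak256 and skein256 take two batches of 8 interleaved 64-bit lanes (vhashA/vhashB). A plain-C sketch of that regrouping, with generic helper names standing in for the hand-vectorised simd-utils routines:

/* Generic interleave helpers -- a sketch only; the real intrlv_8x64 /
   dintrlv_16x32 are hand-vectorised.  Lane l's word w lives at
   src[ w*nlanes + l ] in the interleaved buffer. */
#include <stdint.h>

static void dintrlv_u32( uint32_t *dst[], const uint32_t *src,
                         int nlanes, int nwords )
{
   for ( int w = 0; w < nwords; w++ )
      for ( int l = 0; l < nlanes; l++ )
         dst[l][w] = src[ w*nlanes + l ];
}

static void intrlv_u64( uint64_t *dst, uint64_t * const src[],
                        int nlanes, int nwords )
{
   for ( int w = 0; w < nwords; w++ )
      for ( int l = 0; l < nlanes; l++ )
         dst[ w*nlanes + l ] = src[l][w];
}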

View File

@@ -14,12 +14,12 @@ bool lyra2h_4way_thread_init()
return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}
-static __thread blake256_4way_context l2h_4way_blake_mid;
+static __thread blake256_4x32_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
-blake256_4way_init( &l2h_4way_blake_mid );
-blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
+blake256_4x32_init( &l2h_4way_blake_mid );
+blake256_4x32_update( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
@@ -29,11 +29,11 @@ void lyra2h_4way_hash( void *state, const void *input )
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
+blake256_4x32_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
-blake256_4way_update( &ctx_blake, input + (64*4), 16 );
-blake256_4way_close( &ctx_blake, vhash );
+blake256_4x32_update( &ctx_blake, input + (64*4), 16 );
+blake256_4x32_close( &ctx_blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
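l2h_4way_blake_mid above caches the blake256 state over the nonce-free first 64 bytes of the 80-byte header; each hash then copies that state and absorbs only the final 16 bytes, which contain the nonce. The same idea in miniature, with a toy hash standing in for blake256_4x32:

/* Toy stand-in for the real context -- illustrates midstate caching only. */
#include <stdint.h>
#include <string.h>

typedef struct { uint32_t acc; } toy_ctx;

static void toy_init( toy_ctx *c ) { c->acc = 0x6a09e667u; }
static void toy_update( toy_ctx *c, const void *p, size_t n )
{  const uint8_t *b = (const uint8_t*)p;  while ( n-- ) c->acc = c->acc*31u + *b++; }
static void toy_close( const toy_ctx *c, void *out ) { memcpy( out, &c->acc, 4 ); }

static toy_ctx blake_mid;                     /* cached midstate           */

void midstate( const void *header )           /* once per work unit        */
{  toy_init( &blake_mid );  toy_update( &blake_mid, header, 64 ); }

void hash_one( void *out, const void *header )   /* per nonce              */
{
   toy_ctx c = blake_mid;     /* struct copy beats re-absorbing 64 bytes   */
   toy_update( &c, (const uint8_t*)header + 64, 16 );
   toy_close( &c, out );
}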

View File

@@ -7,25 +7,24 @@
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV2_16WAY)
typedef struct {
-blake256_16way_context blake;
-keccak256_8way_context keccak;
+blake256_16x32_context blake;
+keccak256_8x64_context keccak;
cubehashParam cube;
-skein256_8way_context skein;
-bmw256_16way_context bmw;
+skein256_8x64_context skein;
+bmw256_16x32_context bmw;
} lyra2v2_16way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_16way_ctx_holder l2v2_16way_ctx;
bool init_lyra2rev2_16way_ctx()
{
-keccak256_8way_init( &l2v2_16way_ctx.keccak );
+keccak256_8x64_init( &l2v2_16way_ctx.keccak );
cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 );
-skein256_8way_init( &l2v2_16way_ctx.skein );
-bmw256_16way_init( &l2v2_16way_ctx.bmw );
+skein256_8x64_init( &l2v2_16way_ctx.skein );
+bmw256_16x32_init( &l2v2_16way_ctx.bmw );
return true;
}
@@ -51,8 +50,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) );
-blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
-blake256_16way_close( &ctx.blake, vhash );
+blake256_16x32_update( &ctx.blake, input + (64<<4), 16 );
+blake256_16x32_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7,
@@ -62,17 +61,17 @@ void lyra2rev2_16way_hash( void *state, const void *input )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
-keccak256_8way_update( &ctx.keccak, vhash, 32 );
-keccak256_8way_close( &ctx.keccak, vhash );
+keccak256_8x64_update( &ctx.keccak, vhash, 32 );
+keccak256_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );
-keccak256_8way_init( &ctx.keccak );
-keccak256_8way_update( &ctx.keccak, vhash, 32 );
-keccak256_8way_close( &ctx.keccak, vhash );
+keccak256_8x64_init( &ctx.keccak );
+keccak256_8x64_update( &ctx.keccak, vhash, 32 );
+keccak256_8x64_close( &ctx.keccak, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
@@ -122,21 +121,20 @@ void lyra2rev2_16way_hash( void *state, const void *input )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
-skein256_8way_update( &ctx.skein, vhash, 32 );
-skein256_8way_close( &ctx.skein, vhash );
+skein256_8x64_update( &ctx.skein, vhash, 32 );
+skein256_8x64_close( &ctx.skein, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12,
hash13, hash14, hash15, 256 );
-skein256_8way_init( &ctx.skein );
-skein256_8way_update( &ctx.skein, vhash, 32 );
-skein256_8way_close( &ctx.skein, vhash );
+skein256_8x64_init( &ctx.skein );
+skein256_8x64_update( &ctx.skein, vhash, 32 );
+skein256_8x64_close( &ctx.skein, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
@@ -160,8 +158,8 @@ void lyra2rev2_16way_hash( void *state, const void *input )
hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash15, 256 );
-bmw256_16way_update( &ctx.bmw, vhash, 32 );
-bmw256_16way_close( &ctx.bmw, state );
+bmw256_16x32_update( &ctx.bmw, vhash, 32 );
+bmw256_16x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
@@ -186,8 +184,8 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
-blake256_16way_init( &l2v2_16way_ctx.blake );
-blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 );
+blake256_16x32_init( &l2v2_16way_ctx.blake );
+blake256_16x32_update( &l2v2_16way_ctx.blake, vdata, 64 );
do
{
@@ -214,21 +212,21 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV2_8WAY)
typedef struct {
-blake256_8way_context blake;
-keccak256_4way_context keccak;
+blake256_8x32_context blake;
+keccak256_4x64_context keccak;
cubehashParam cube;
-skein256_4way_context skein;
-bmw256_8way_context bmw;
+skein256_4x64_context skein;
+bmw256_8x32_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
bool init_lyra2rev2_8way_ctx()
{
-keccak256_4way_init( &l2v2_8way_ctx.keccak );
+keccak256_4x64_init( &l2v2_8way_ctx.keccak );
cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 );
-skein256_4way_init( &l2v2_8way_ctx.skein );
-bmw256_8way_init( &l2v2_8way_ctx.bmw );
+skein256_4x64_init( &l2v2_8way_ctx.skein );
+bmw256_8x32_init( &l2v2_8way_ctx.bmw );
return true;
}
@@ -246,20 +244,20 @@ void lyra2rev2_8way_hash( void *state, const void *input )
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
-blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
-blake256_8way_close( &ctx.blake, vhash );
+blake256_8x32_update( &ctx.blake, input + (64<<3), 16 );
+blake256_8x32_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
-keccak256_4way_update( &ctx.keccak, vhash, 32 );
-keccak256_4way_close( &ctx.keccak, vhash );
+keccak256_4x64_update( &ctx.keccak, vhash, 32 );
+keccak256_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
-keccak256_4way_init( &ctx.keccak );
-keccak256_4way_update( &ctx.keccak, vhash, 32 );
-keccak256_4way_close( &ctx.keccak, vhash );
+keccak256_4x64_init( &ctx.keccak );
+keccak256_4x64_update( &ctx.keccak, vhash, 32 );
+keccak256_4x64_close( &ctx.keccak, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -282,13 +280,13 @@ void lyra2rev2_8way_hash( void *state, const void *input )
LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 );
-skein256_4way_update( &ctx.skein, vhash, 32 );
-skein256_4way_close( &ctx.skein, vhash );
+skein256_4x64_update( &ctx.skein, vhash, 32 );
+skein256_4x64_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 );
-skein256_4way_init( &ctx.skein );
-skein256_4way_update( &ctx.skein, vhash, 32 );
-skein256_4way_close( &ctx.skein, vhash );
+skein256_4x64_init( &ctx.skein );
+skein256_4x64_update( &ctx.skein, vhash, 32 );
+skein256_4x64_close( &ctx.skein, vhash );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
@@ -303,8 +301,8 @@ void lyra2rev2_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
-bmw256_8way_update( &ctx.bmw, vhash, 32 );
-bmw256_8way_close( &ctx.bmw, state );
+bmw256_8x32_update( &ctx.bmw, vhash, 32 );
+bmw256_8x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
@@ -328,8 +326,8 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-blake256_8way_init( &l2v2_8way_ctx.blake );
-blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
+blake256_8x32_init( &l2v2_8way_ctx.blake );
+blake256_8x32_update( &l2v2_8way_ctx.blake, vdata, 64 );
do
{
@@ -356,21 +354,21 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV2_4WAY)
typedef struct {
-blake256_4way_context blake;
-keccak256_4way_context keccak;
+blake256_4x32_context blake;
+keccak256_4x64_context keccak;
cubehashParam cube;
-skein256_4way_context skein;
-bmw256_4way_context bmw;
+skein256_4x64_context skein;
+bmw256_4x32_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
bool init_lyra2rev2_4way_ctx()
{
-keccak256_4way_init( &l2v2_4way_ctx.keccak );
+keccak256_4x64_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
-skein256_4way_init( &l2v2_4way_ctx.skein );
-bmw256_4way_init( &l2v2_4way_ctx.bmw );
+skein256_4x64_init( &l2v2_4way_ctx.skein );
+bmw256_4x32_init( &l2v2_4way_ctx.bmw );
return true;
}
@@ -385,13 +383,13 @@ void lyra2rev2_4way_hash( void *state, const void *input )
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
-blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
-blake256_4way_close( &ctx.blake, vhash );
+blake256_4x32_update( &ctx.blake, input + (64<<2), 16 );
+blake256_4x32_close( &ctx.blake, vhash );
rintrlv_4x32_4x64( vhash64, vhash, 256 );
-keccak256_4way_update( &ctx.keccak, vhash64, 32 );
-keccak256_4way_close( &ctx.keccak, vhash64 );
+keccak256_4x64_update( &ctx.keccak, vhash64, 32 );
+keccak256_4x64_close( &ctx.keccak, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -410,8 +408,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
-skein256_4way_update( &ctx.skein, vhash64, 32 );
-skein256_4way_close( &ctx.skein, vhash64 );
+skein256_4x64_update( &ctx.skein, vhash64, 32 );
+skein256_4x64_close( &ctx.skein, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -426,8 +424,8 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
-bmw256_4way_update( &ctx.bmw, vhash, 32 );
-bmw256_4way_close( &ctx.bmw, state );
+bmw256_4x32_update( &ctx.bmw, vhash, 32 );
+bmw256_4x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -451,8 +449,8 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
-blake256_4way_init( &l2v2_4way_ctx.blake );
-blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
+blake256_4x32_init( &l2v2_4way_ctx.blake );
+blake256_4x32_update( &l2v2_4way_ctx.blake, vdata, 64 );
do
{
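Note how every 16-lane 32-bit stage above (blake256, bmw256) pairs with exactly two 8-lane 64-bit batches (keccak256, skein256): each grouping fills one 512-bit register, which is why the lanes split as hash0-7 and hash8-15. A checkable sketch of the arithmetic:

/* Sketch only: the lane counts are just the 512-bit register budget. */
#include <assert.h>

enum {
   VEC_BITS  = 512,
   LANES_X32 = VEC_BITS / 32,   /* 16 lanes for blake256 / bmw256     */
   LANES_X64 = VEC_BITS / 64    /*  8 lanes for keccak256 / skein256  */
};

int main( void )
{
   assert( LANES_X32 == 2 * LANES_X64 );  /* one 16x32 stage feeds two 8x64 batches */
   return 0;
}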

View File

@@ -9,18 +9,18 @@
#if defined (LYRA2REV3_16WAY)
typedef struct {
-blake256_16way_context blake;
+blake256_16x32_context blake;
cube_4way_context cube;
-bmw256_16way_context bmw;
+bmw256_16x32_context bmw;
} lyra2v3_16way_ctx_holder;
static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
bool init_lyra2rev3_16way_ctx()
{
-blake256_16way_init( &l2v3_16way_ctx.blake );
+blake256_16x32_init( &l2v3_16way_ctx.blake );
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
-bmw256_16way_init( &l2v3_16way_ctx.bmw );
+bmw256_16x32_init( &l2v3_16way_ctx.bmw );
return true;
}
@@ -46,8 +46,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
-blake256_16way_update( &ctx.blake, input + (64*16), 16 );
-blake256_16way_close( &ctx.blake, vhash );
+blake256_16x32_update( &ctx.blake, input + (64*16), 16 );
+blake256_16x32_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -120,8 +120,8 @@ void lyra2rev3_16way_hash( void *state, const void *input )
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
-bmw256_16way_update( &ctx.bmw, vhash, 32 );
-bmw256_16way_close( &ctx.bmw, state );
+bmw256_16x32_update( &ctx.bmw, vhash, 32 );
+bmw256_16x32_close( &ctx.bmw, state );
}
@@ -145,8 +145,8 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
-blake256_16way_init( &l2v3_16way_ctx.blake );
-blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
+blake256_16x32_init( &l2v3_16way_ctx.blake );
+blake256_16x32_update( &l2v3_16way_ctx.blake, vdata, 64 );
do
{
@@ -178,18 +178,18 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
#elif defined (LYRA2REV3_8WAY)
typedef struct {
-blake256_8way_context blake;
+blake256_8x32_context blake;
cubehashParam cube;
-bmw256_8way_context bmw;
+bmw256_8x32_context bmw;
} lyra2v3_8way_ctx_holder;
static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx;
bool init_lyra2rev3_8way_ctx()
{
-blake256_8way_init( &l2v3_8way_ctx.blake );
+blake256_8x32_init( &l2v3_8way_ctx.blake );
cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 );
-bmw256_8way_init( &l2v3_8way_ctx.bmw );
+bmw256_8x32_init( &l2v3_8way_ctx.bmw );
return true;
}
@@ -207,8 +207,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
-blake256_8way_update( &ctx.blake, input + (64*8), 16 );
-blake256_8way_close( &ctx.blake, vhash );
+blake256_8x32_update( &ctx.blake, input + (64*8), 16 );
+blake256_8x32_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -243,8 +243,8 @@ void lyra2rev3_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
-bmw256_8way_update( &ctx.bmw, vhash, 32 );
-bmw256_8way_close( &ctx.bmw, state );
+bmw256_8x32_update( &ctx.bmw, vhash, 32 );
+bmw256_8x32_close( &ctx.bmw, state );
}
@@ -269,8 +269,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-blake256_8way_init( &l2v3_8way_ctx.blake );
-blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
+blake256_8x32_init( &l2v3_8way_ctx.blake );
+blake256_8x32_update( &l2v3_8way_ctx.blake, vdata, 64 );
do
{
@@ -300,19 +300,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
#if defined (LYRA2REV3_4WAY)
typedef struct {
-blake256_4way_context blake;
+blake256_4x32_context blake;
cubehashParam cube;
-bmw256_4way_context bmw;
+bmw256_4x32_context bmw;
} lyra2v3_4way_ctx_holder;
-//static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx;
bool init_lyra2rev3_4way_ctx()
{
-blake256_4way_init( &l2v3_4way_ctx.blake );
+blake256_4x32_init( &l2v3_4way_ctx.blake );
cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
-bmw256_4way_init( &l2v3_4way_ctx.bmw );
+bmw256_4x32_init( &l2v3_4way_ctx.bmw );
return true;
}
@@ -326,8 +325,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
-blake256_4way_update( &ctx.blake, input + (64*4), 16 );
-blake256_4way_close( &ctx.blake, vhash );
+blake256_4x32_update( &ctx.blake, input + (64*4), 16 );
+blake256_4x32_close( &ctx.blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -349,8 +348,8 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
-bmw256_4way_update( &ctx.bmw, vhash, 32 );
-bmw256_4way_close( &ctx.bmw, state );
+bmw256_4x32_update( &ctx.bmw, vhash, 32 );
+bmw256_4x32_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
@@ -374,8 +373,8 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
-blake256_4way_init( &l2v3_4way_ctx.blake );
-blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
+blake256_4x32_init( &l2v3_4way_ctx.blake );
+blake256_4x32_update( &l2v3_4way_ctx.blake, vdata, 64 );
do
{
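The l2v3_*_ctx holders above are declared __thread, so each miner thread keeps a private copy of the cached blake midstate and no locking is needed when scanhash re-initialises it. A minimal, self-contained illustration of the storage class (hypothetical names, not miner code):

#include <pthread.h>
#include <stdio.h>

static __thread int ctx;                 /* one private copy per thread */

static void *worker( void *arg )
{
   ctx = (int)(long)arg;                 /* no data race: thread-local  */
   printf( "thread %ld sees ctx=%d\n", (long)arg, ctx );
   return NULL;
}

int main( void )
{
   pthread_t t[2];
   for ( long i = 0; i < 2; i++ ) pthread_create( &t[i], NULL, worker, (void*)i );
   for ( int i = 0; i < 2; i++ )  pthread_join( t[i], NULL );
   return 0;
}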

View File

@@ -45,7 +45,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
-blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
+blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -139,7 +139,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partially prehash second block without touching nonces in block_buf[3].
-blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -180,7 +180,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
-blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
+blake256_8x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -246,7 +246,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partially prehash second block without touching nonces
-blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -279,12 +279,12 @@ bool lyra2z_4way_thread_init()
return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
-static __thread blake256_4way_context l2z_4way_blake_mid;
+static __thread blake256_4x32_context l2z_4way_blake_mid;
void lyra2z_4way_midstate( const void* input )
{
-blake256_4way_init( &l2z_4way_blake_mid );
-blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
+blake256_4x32_init( &l2z_4way_blake_mid );
+blake256_4x32_update( &l2z_4way_blake_mid, input, 64 );
}
void lyra2z_4way_hash( void *hash, const void *midstate_vars,
@@ -295,15 +295,8 @@ void lyra2z_4way_hash( void *hash, const void *midstate_vars,
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-// blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
-blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
-/*
-memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
-blake256_4way_update( &ctx_blake, input + (64*4), 16 );
-blake256_4way_close( &ctx_blake, vhash );
-*/
+blake256_4x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -357,7 +350,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
block_buf[15] = v128_32( 640 );
// Partially prehash second block without touching nonces
-blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
@@ -454,11 +447,9 @@ bool register_lyra2z_algo( algo_gate_t* gate )
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
-// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
-// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
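The round0_prehash/final_rounds split above is nonce hoisting: the second blake block is constant except for the nonce in block_buf[3], so everything independent of it is computed once outside the scan loop and only the nonce-dependent work is redone per iteration. A toy version of the split (hypothetical round function, not blake):

#include <stdint.h>

/* toy round over 4 words; word 3 plays the nonce */
static uint32_t toy_round( const uint32_t w[4] )
{  return w[0]*3u + w[1]*5u + w[2]*7u + w[3]*11u; }

/* prehash: fold the nonce-independent terms once per work unit */
static uint32_t toy_prehash( const uint32_t w[4] )
{  return w[0]*3u + w[1]*5u + w[2]*7u; }

/* per nonce: one multiply-add instead of the whole round;
   identity: toy_round(w) == toy_final( toy_prehash(w), w[3] ) */
static uint32_t toy_final( uint32_t pre, uint32_t nonce )
{  return pre + nonce*11u; }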

View File

@@ -45,7 +45,7 @@ static const uint64_t blake2b_IV[8] =
#if defined(SIMD512)
-#define G2W_4X64(a,b,c,d) \
+#define G2W(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
c = _mm512_add_epi64( c, d ); \
@@ -56,27 +56,15 @@ static const uint64_t blake2b_IV[8] =
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
-G2W_4X64( s0, s1, s2, s3 ); \
+G2W( s0, s1, s2, s3 ); \
s0 = mm512_shufll256_64( s0 ); \
-s3 = mm512_swap256_128( s3); \
+s3 = mm512_swap256_128( s3 ); \
s2 = mm512_shuflr256_64( s2 ); \
-G2W_4X64( s0, s1, s2, s3 ); \
+G2W( s0, s1, s2, s3 ); \
s0 = mm512_shuflr256_64( s0 ); \
s3 = mm512_swap256_128( s3 ); \
s2 = mm512_shufll256_64( s2 );
-/*
-#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
-G2W_4X64( s0, s1, s2, s3 ); \
-s3 = mm512_shufll256_64( s3 ); \
-s1 = mm512_shuflr256_64( s1); \
-s2 = mm512_swap256_128( s2 ); \
-G2W_4X64( s0, s1, s2, s3 ); \
-s3 = mm512_shuflr256_64( s3 ); \
-s1 = mm512_shufll256_64( s1 ); \
-s2 = mm512_swap256_128( s2 );
-*/
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -95,7 +83,7 @@ static const uint64_t blake2b_IV[8] =
#if defined(__AVX2__)
-#define G_4X64(a,b,c,d) \
+#define G_AVX2(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
@@ -107,27 +95,15 @@ static const uint64_t blake2b_IV[8] =
// Pivot about s1 instead of s0 reduces latency.
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
-G_4X64( s0, s1, s2, s3 ); \
+G_AVX2( s0, s1, s2, s3 ); \
s0 = mm256_shufll_64( s0 ); \
-s3 = mm256_swap_128( s3); \
+s3 = mm256_swap_128( s3 ); \
s2 = mm256_shuflr_64( s2 ); \
-G_4X64( s0, s1, s2, s3 ); \
+G_AVX2( s0, s1, s2, s3 ); \
s0 = mm256_shuflr_64( s0 ); \
s3 = mm256_swap_128( s3 ); \
s2 = mm256_shufll_64( s2 );
-/*
-#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
-G_4X64( s0, s1, s2, s3 ); \
-s3 = mm256_shufll_64( s3 ); \
-s1 = mm256_shuflr_64( s1); \
-s2 = mm256_swap_128( s2 ); \
-G_4X64( s0, s1, s2, s3 ); \
-s3 = mm256_shuflr_64( s3 ); \
-s1 = mm256_shufll_64( s1 ); \
-s2 = mm256_swap_128( s2 );
-*/
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -148,7 +124,7 @@ static const uint64_t blake2b_IV[8] =
// process 2 columns in parallel
// returns void, all args updated
-#define G_2X64(a,b,c,d) \
+#define G_128(a,b,c,d) \
a = v128_add64( a, b ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
@@ -161,16 +137,16 @@ static const uint64_t blake2b_IV[8] =
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
v128u64_t t; \
-G_2X64( s0, s2, s4, s6 ); \
-G_2X64( s1, s3, s5, s7 ); \
+G_128( s0, s2, s4, s6 ); \
+G_128( s1, s3, s5, s7 ); \
t = v128_alignr64( s7, s6, 1 ); \
s6 = v128_alignr64( s6, s7, 1 ); \
s7 = t; \
t = v128_alignr64( s2, s3, 1 ); \
s2 = v128_alignr64( s3, s2, 1 ); \
s3 = t; \
-G_2X64( s0, s2, s5, s6 ); \
-G_2X64( s1, s3, s4, s7 ); \
+G_128( s0, s2, s5, s6 ); \
+G_128( s1, s3, s4, s7 ); \
t = v128_alignr64( s6, s7, 1 ); \
s6 = v128_alignr64( s7, s6, 1 ); \
s7 = t;
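For reference, G2W, G_AVX2 and G_128 are all the same Blake2b G mixing function (rotation counts 32, 24, 16, 63, without Blake2b's message-word additions, since Lyra2's sponge uses the bare permutation), applied lane-wise at different vector widths. The scalar equivalent:

#include <stdint.h>

static inline uint64_t ror64( uint64_t x, int n )
{  return ( x >> n ) | ( x << ( 64 - n ) ); }

/* Blake2b G on one column/diagonal quartet of the state; the macros
   above run it on 2, 4 or 8 such quartets at once. */
static inline void blake2b_g( uint64_t *a, uint64_t *b,
                              uint64_t *c, uint64_t *d )
{
   *a += *b;  *d = ror64( *d ^ *a, 32 );
   *c += *d;  *b = ror64( *b ^ *c, 24 );
   *a += *b;  *d = ror64( *d ^ *a, 16 );
   *c += *d;  *b = ror64( *b ^ *c, 63 );
}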