v3.8.2

2026-02-23 00:43:08 +00:00 · 2018-02-15 14:48:50 -05:00
parent e4265a6f11
commit d60a268972
57 changed files with 3469 additions and 2135 deletions
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -0,0 +1,163 @@
+#include "allium-gate.h"
+#include <memory.h>
+
+#if defined (ALLIUM_4WAY)	
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/groestl/aes_ni/hash-groestl256.h"
+
+typedef struct {   // one context per primitive used by allium_4way_hash
+   blake256_4way_context     blake;    // set up in scanhash_allium_4way, not in init_allium_4way_ctx
+   keccak256_4way_context    keccak;   // initialised by init_allium_4way_ctx
+   cubehashParam             cube;     // used once per lane; reset with cubehashReinit between lanes
+   skein256_4way_context     skein;    // initialised by init_allium_4way_ctx
+   hashState_groestl256      groestl;  // used once per lane; re-copied from the template between lanes
+
+} allium_4way_ctx_holder;
+
+static allium_4way_ctx_holder allium_4way_ctx;   // shared template; allium_4way_hash memcpy's it into a local copy on every call
+
+// Prepare the shared template context for the keccak, cubehash, skein and
+// groestl stages.  The blake member is deliberately left alone here; it is
+// initialised by scanhash_allium_4way.
+void init_allium_4way_ctx()
+{
+   allium_4way_ctx_holder *const tmpl = &allium_4way_ctx;
+
+   keccak256_4way_init( &tmpl->keccak );
+   cubehashInit( &tmpl->cube, 256, 16, 32 );
+   skein256_4way_init( &tmpl->skein );
+   init_groestl256( &tmpl->groestl, 32 );
+}
+
+// Hash four block headers in parallel with the allium chain:
+// blake256 -> keccak256 -> Lyra2RE -> cubehash -> Lyra2RE -> skein256 ->
+// groestl256.
+//
+// input  four headers interleaved as prepared by scanhash_allium_4way.
+//        Only the 16 bytes per lane that follow the first 64 are hashed
+//        here; the shared blake context must already have absorbed the
+//        first 64 bytes of every lane.
+// state  receives four 32-byte digests back to back, lane 0 first.
+void allium_4way_hash( void *state, const void *input )
+{
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
+   uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
+   allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+   // Byte views of the caller's buffers: arithmetic on void* is a GNU
+   // extension, not ISO C.
+   const uint8_t *in  = (const uint8_t*)input;
+   uint8_t       *out = (uint8_t*)state;
+
+   // Work on a private copy so the shared template (and its blake
+   // midstate) is never modified.
+   memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
+   blake256_4way( &ctx.blake, in + (64<<2), 16 );
+   blake256_4way_close( &ctx.blake, vhash32 );
+
+   mm256_reinterleave_4x64( vhash64, vhash32, 256 );
+   keccak256_4way( &ctx.keccak, vhash64, 32 );
+   keccak256_4way_close( &ctx.keccak, vhash64 );
+   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
+   // Lyra2RE and cubehash are not vectorised here: run once per lane.
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
+
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+
+   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   skein256_4way( &ctx.skein, vhash64, 32 );
+   skein256_4way_close( &ctx.skein, vhash64 );
+   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
+   // groestl is also per lane; restore the initialised state from the
+   // template before each subsequent lane.
+   update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
+
+   memcpy( out,      hash0, 32 );
+   memcpy( out + 32, hash1, 32 );
+   memcpy( out + 64, hash2, 32 );
+   memcpy( out + 96, hash3, 32 );
+}
+
+// Scan nonces four at a time starting at work->data[19].
+//
+// The first 64 bytes of the byte-swapped header are absorbed into the
+// shared blake context once, then each iteration hashes nonces n..n+3.
+// Scanning stops after the first iteration that yields at least one hit,
+// when the nonce range is exhausted, or when a restart is requested.
+//
+// Hits are reported through work->nfound[] / work->nonces[] (one slot per
+// lane).  Returns the number of lanes that hit in the final iteration and
+// stores the hash count in *hashes_done.
+int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   // NOTE(review): sampled before the benchmark override below, so the
+   // quick pre-filter does not see the benchmark target - confirm intended.
+   const uint32_t Htarg = ptarget[7];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 76; // 19*4: nonce word of lane 0, lanes 1-3 follow
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   blake256_4way_init( &allium_4way_ctx.blake );
+   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
+
+   do {
+      for ( uint32_t lane = 0; lane < 4; lane++ )
+      {
+         found[lane] = false;
+         be32enc( noncep + lane, n + lane );
+      }
+
+      allium_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( uint32_t lane = 0; lane < 4; lane++ )
+      {
+         uint32_t *lane_hash = hash + 8*lane;
+
+         if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
+         {
+            found[lane] = true;
+            num_found++;
+            nonces[lane] = n + lane;   // pdata[19] already holds n
+            work_set_target_ratio( work, lane_hash );
+         }
+      }
+      n += 4;
+      // max_nonce > 4 guards the unsigned subtraction: without it a
+      // max_nonce below 4 wraps around and the scan overruns its range.
+   } while ( (num_found == 0) && (max_nonce > 4) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart );
+
+   // NOTE(review): n has already been advanced past the last group, so the
+   // +1 looks like an over-count by one; kept to match the scalar scanner.
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/lyra2/allium-gate.c
+++ b/algo/lyra2/allium-gate.c
@@ -0,0 +1,22 @@
+#include "allium-gate.h"
+
+// Returns the constant 0xFFFF; installed as the gate's get_max64 callback
+// by register_allium_algo.
+int64_t get_max64_0xFFFFLL()
+{
+   return 0xFFFFLL;
+}
+
+// Install the allium callbacks into *gate.
+//
+// When ALLIUM_4WAY is defined the 4-way hash/scanhash pair is used and its
+// template context is initialised; otherwise the scalar pair is used.
+// Always returns true.
+bool register_allium_algo( algo_gate_t* gate )
+{
+#if defined (ALLIUM_4WAY)
+  init_allium_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_allium_4way;
+  gate->hash      = (void*)&allium_4way_hash;
+#else
+  init_allium_ctx();
+  gate->scanhash  = (void*)&scanhash_allium;
+  gate->hash      = (void*)&allium_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->set_target        = (void*)&alt_set_target;
+  gate->get_max64         = (void*)&get_max64_0xFFFFLL;
+  return true;
+}
+
+
--- a/algo/lyra2/allium-gate.h
+++ b/algo/lyra2/allium-gate.h
@@ -0,0 +1,29 @@
+#ifndef ALLIUM_GATE_H__
+#define ALLIUM_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+#include "lyra2.h"
+
+#if defined(__AVX2__) && defined(__AES__)   // 4-way path is enabled only when both are available at compile time
+  #define ALLIUM_4WAY
+#endif
+
+bool register_allium_algo( algo_gate_t* gate );   // installs the allium callbacks into *gate
+
+#if defined(ALLIUM_4WAY)   // 4-way entry points
+
+void allium_4way_hash( void *state, const void *input );   // hashes four interleaved headers; writes 4 x 32 bytes to state
+int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );   // returns the number of lanes that hit
+void init_allium_4way_ctx();   // one-time setup of the shared 4-way template context
+
+#endif   // ALLIUM_4WAY
+
+void allium_hash( void *state, const void *input );   // scalar hash; writes 32 bytes to state
+int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );   // returns 1 on a hit, 0 otherwise
+void init_allium_ctx();   // one-time setup of the shared scalar template context
+
+#endif   // ALLIUM_GATE_H__
+
--- a/algo/lyra2/allium.c
+++ b/algo/lyra2/allium.c
@@ -0,0 +1,111 @@
+#include "allium-gate.h"
+#include <memory.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#if defined(__AES__)
+#include "algo/groestl/aes_ni/hash-groestl256.h"
+#else
+#include "algo/groestl/sph_groestl.h"
+#endif
+#include "lyra2.h"
+
+typedef struct {   // one context per primitive used by allium_hash
+        cubehashParam            cube;     // initialised by init_allium_ctx
+        sph_blake256_context     blake;    // initialised and fed the first 64 header bytes by scanhash_allium
+        sph_keccak256_context    keccak;   // initialised by init_allium_ctx
+        sph_skein256_context     skein;    // initialised by init_allium_ctx
+#if defined (__AES__)   // groestl implementation is chosen at compile time
+        hashState_groestl256     groestl;
+#else
+        sph_groestl256_context   groestl;
+#endif
+} allium_ctx_holder;
+
+static allium_ctx_holder allium_ctx;   // shared template; allium_hash memcpy's it into a local copy on every call
+
+// Prepare the shared template context for the keccak, cubehash, skein and
+// groestl stages.  The blake member is initialised by scanhash_allium
+// instead.
+void init_allium_ctx()
+{
+        allium_ctx_holder *const tmpl = &allium_ctx;
+
+        sph_keccak256_init( &tmpl->keccak );
+        cubehashInit( &tmpl->cube, 256, 16, 32 );
+        sph_skein256_init( &tmpl->skein );
+#if defined (__AES__)
+        init_groestl256( &tmpl->groestl, 32 );
+#else
+        sph_groestl256_init( &tmpl->groestl );
+#endif
+}
+
+// Scalar allium hash:
+// blake256 -> keccak256 -> Lyra2RE -> cubehash -> Lyra2RE -> skein256 ->
+// groestl256.
+//
+// input  header buffer; only the 16 bytes starting at offset 64 are hashed
+//        here.  The shared blake context must already have absorbed the
+//        first 64 bytes (scanhash_allium does this).
+// state  receives the 32-byte digest.
+void allium_hash(void *state, const void *input)
+{
+    uint32_t hash[8] __attribute__ ((aligned (64)));
+    allium_ctx_holder ctx __attribute__ ((aligned (32)));
+
+    // Work on a private copy so the shared template is never modified.
+    memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
+    // Cast before offsetting: arithmetic on void* is a GNU extension.
+    sph_blake256( &ctx.blake, (const uint8_t*)input + 64, 16 );
+    sph_blake256_close( &ctx.blake, hash );
+
+    sph_keccak256( &ctx.keccak, hash, 32 );
+    sph_keccak256_close( &ctx.keccak, hash );
+
+    LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
+
+    cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
+
+    LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
+
+    sph_skein256( &ctx.skein, hash, 32 );
+    sph_skein256_close( &ctx.skein, hash );
+
+#if defined (__AES__)
+   update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
+#else
+   sph_groestl256( &ctx.groestl, hash, 32 );
+   sph_groestl256_close( &ctx.groestl, hash );
+#endif
+
+    memcpy(state, hash, 32);
+}
+
+// Scalar nonce scanner.  Absorbs the first 64 header bytes into the shared
+// blake context once, then hashes one nonce per iteration starting at
+// work->data[19].  Returns 1 as soon as a hash passes fulltest (the winning
+// nonce is left in work->data[19]); returns 0 when the range is exhausted
+// or a restart is requested.
+int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done )
+{
+    uint32_t _ALIGN(128) digest[8];
+    uint32_t _ALIGN(128) be_header[20];
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t n = first_nonce;
+
+    if ( opt_benchmark )
+        ptarget[7] = 0x3ffff;
+
+    // Word 19 (the nonce) is written inside the loop.
+    for ( int k = 0; k < 19; k++ )
+        be32enc( &be_header[k], pdata[k] );
+
+    sph_blake256_init( &allium_ctx.blake );
+    sph_blake256( &allium_ctx.blake, be_header, 64 );
+
+    for (;;)
+    {
+        be32enc( &be_header[19], n );
+        allium_hash( digest, be_header );
+
+        if ( digest[7] <= Htarg && fulltest( digest, ptarget ) )
+        {
+            work_set_target_ratio( work, digest );
+            pdata[19] = n;
+            *hashes_done = n - first_nonce;
+            return 1;
+        }
+
+        n++;
+        if ( n >= max_nonce || work_restart[thr_id].restart )
+            break;
+    }
+
+    pdata[19] = n;
+    *hashes_done = n - first_nonce + 1;
+    return 0;
+}
+
--- a/algo/lyra2/allium.c.broke
+++ b/algo/lyra2/allium.c.broke
@@ -0,0 +1,123 @@
+#include "allium-gate.h"
+#include <memory.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#if defined(__AES__)
+#include "algo/groestl/aes_ni/hash-groestl256.h"
+#else
+#include "algo/groestl/sph_groestl.h"
+#endif
+
+typedef struct {   // one context per primitive used by allium_hash
+        cubehashParam            cube;     // initialised by init_allium_ctx
+        sph_blake256_context     blake;    // initialised by init_allium_ctx; source of the per-thread midstate
+        sph_keccak256_context    keccak;   // initialised by init_allium_ctx
+        sph_skein256_context     skein;    // initialised by init_allium_ctx
+#if defined (__AES__)   // groestl implementation is chosen at compile time
+        hashState_groestl256     groestl;
+#else
+        sph_groestl256_context   groestl;
+#endif
+} allium_ctx_holder;
+
+static allium_ctx_holder allium_ctx;   // shared template; allium_hash memcpy's it into a local copy on every call
+static __thread sph_blake256_context allium_blake_mid;   // per-thread blake state filled by allium_blake256_midstate
+
+// Initialise every stage of the shared template context, blake included.
+void init_allium_ctx()
+{
+        allium_ctx_holder *const tmpl = &allium_ctx;
+
+        cubehashInit( &tmpl->cube, 256, 16, 32 );
+        sph_blake256_init( &tmpl->blake );
+        sph_keccak256_init( &tmpl->keccak );
+        sph_skein256_init( &tmpl->skein );
+#if defined (__AES__)
+        init_groestl256( &tmpl->groestl, 32 );
+#else
+        sph_groestl256_init( &tmpl->groestl );
+#endif
+}
+
+// Copy the initialised blake context from the template into this thread's
+// midstate and absorb the first 64 bytes of input into it.
+void allium_blake256_midstate( const void* input )
+{
+    sph_blake256_context *const mid = &allium_blake_mid;
+
+    memcpy( mid, &allium_ctx.blake, sizeof(*mid) );
+    sph_blake256( mid, input, 64 );
+}
+
+// Scalar allium hash using the thread-local blake midstate:
+// blake256 -> keccak256 -> Lyra2RE -> cubehash -> Lyra2RE -> skein256 ->
+// groestl256.
+//
+// input  80-byte header; only the last 16 bytes are hashed here, the first
+//        64 must already be in allium_blake_mid (allium_blake256_midstate).
+// state  receives the 32-byte digest.
+void allium_hash( void *state, const void *input )
+{
+   allium_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
+   uint8_t hash[128] __attribute__ ((aligned (64)));
+   const int midlen = 64;            // bytes
+   const int tail   = 80 - midlen;   // 16
+
+   // Resume blake from the per-thread midstate instead of the template.
+   memcpy( &ctx.blake, &allium_blake_mid, sizeof allium_blake_mid );
+   sph_blake256( &ctx.blake, (const uint8_t*)input + midlen, tail );
+   sph_blake256_close( &ctx.blake, hash );
+
+   sph_keccak256( &ctx.keccak, hash, 32 );
+   sph_keccak256_close(&ctx.keccak, hash);
+
+   LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
+
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
+
+   LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
+
+   sph_skein256( &ctx.skein, hash, 32 );
+   sph_skein256_close( &ctx.skein, hash );
+
+#if defined (__AES__)
+   update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
+#else
+   // Must use the groestl context here; the skein context is a different
+   // type and has already been finalised above.
+   sph_groestl256( &ctx.groestl, hash, 32 );
+   sph_groestl256_close( &ctx.groestl, hash );
+#endif
+
+   memcpy( state, hash, 32 );
+}
+
+// Scalar nonce scanner built on the thread-local blake midstate.  Hashes
+// one nonce per iteration starting at work->data[19].  Returns 1 as soon as
+// a hash passes fulltest (the winning nonce is left in work->data[19]);
+// returns 0 when the range is exhausted or a restart is requested.
+int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done )
+{
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t be_header[20] __attribute__ ((aligned (64)));
+    uint32_t digest[8] __attribute__ ((aligned (64)));
+    const uint32_t first_nonce = pdata[19];
+    uint32_t n = first_nonce;
+    const uint32_t Htarg = ptarget[7];
+
+    if ( opt_benchmark )
+        ptarget[7] = 0x0000ff;
+
+    swab32_array( be_header, pdata, 20 );
+
+    allium_blake256_midstate( be_header );
+
+    do {
+        be32enc( &be_header[19], n );
+        allium_hash( digest, be_header );
+
+        if ( digest[7] <= Htarg && fulltest( digest, ptarget ) )
+        {
+            pdata[19] = n;
+            work_set_target_ratio( work, digest );
+            *hashes_done = pdata[19] - first_nonce;
+            return 1;
+        }
+        n++;
+
+    } while ( n < max_nonce && !work_restart[thr_id].restart );
+
+    pdata[19] = n;
+    *hashes_done = n - first_nonce + 1;
+    return 0;
+}
+
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -47,8 +47,9 @@
 */

 int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
-               uint64_t pwdlen, const void *salt, uint64_t saltlen,
-               uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
+               const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+               const uint64_t timeCost, const uint64_t nRows,
+               const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
@@ -67,12 +68,14 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   //Tries to allocate enough space for the whole memory matrix

   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   // for Lyra2REv2, nCols = 4, v1 was using 8
   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
   uint64_t *ptrWord = wholeMatrix;

+   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
   //=== Getting the password + salt + basil padded with 10*1 ==========//
   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
   //but this ensures that the password copied locally will be overwritten as soon as possible
@@ -209,8 +212,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
 }

 int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
-            uint64_t pwdlen, const void *salt, uint64_t saltlen,
-            uint64_t timeCost, uint64_t nRows, uint64_t nCols )
+            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+            const uint64_t timeCost, const uint64_t nRows,
+            const uint64_t nCols )
 {
    //========================== Basic variables ============================//
    uint64_t _ALIGN(256) state[16];
@@ -228,7 +232,9 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
    //Tries to allocate enough space for the whole memory matrix

    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-//    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );

    //==== Getting the password + salt + basil padded with 10*1 ============//
    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
@@ -347,9 +353,9 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
 }

 // Lyra2RE doesn't like the new wholeMatrix implementation
-int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
-             uint64_t pwdlen, const void *salt, uint64_t saltlen,
-             uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
+int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
+             const void *salt, const uint64_t saltlen, const uint64_t timeCost,
+             const uint64_t nRows, const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
@@ -374,18 +380,19 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;

   i = (int64_t)ROW_LEN_BYTES * nRows;
-   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
+   uint64_t *wholeMatrix = _mm_malloc( i, 32 );
+//   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
   if (wholeMatrix == NULL)
      return -1;
-/*
-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
-#elif defined(__AVX__)
-   memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
-#else
+
+//#if defined (__AVX2__)
+//   memset_zero_m256i( (__m256i*)wholeMatrix, i<<5 );
+//#elif defined(__AVX__)
+//   memset_zero_m128i( (__m128i*)wholeMatrix, i<<4 );
+//#else
   memset(wholeMatrix, 0, i);
-#endif
-*/
+//#endif
+
   uint64_t *ptrWord = wholeMatrix;

   //=== Getting the password + salt + basil padded with 10*1 ==========//
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -54,4 +54,6 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
            uint64_t pwdlen, const void *salt, uint64_t saltlen,
            uint64_t timeCost, uint64_t nRows, uint64_t nCols );

+int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
+
 #endif /* LYRA2_H_ */
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -7,9 +7,6 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
-
-#include "algo/cubehash/sph_cubehash.h"
-//#include "algo/bmw/sph_bmw.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h" 

 typedef struct {
@@ -18,19 +15,16 @@ typedef struct {
   cubehashParam             cube;
   skein256_4way_context     skein;
   bmw256_4way_context          bmw;
-//        sph_bmw256_context       bmw;
 } lyra2v2_4way_ctx_holder;

 static lyra2v2_4way_ctx_holder l2v2_4way_ctx;

 void init_lyra2rev2_4way_ctx()
 {
-//   blake256_4way_init( &l2v2_4way_ctx.blake );
   keccak256_4way_init( &l2v2_4way_ctx.keccak );
   cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
   skein256_4way_init( &l2v2_4way_ctx.skein );
   bmw256_4way_init( &l2v2_4way_ctx.bmw );
-//        sph_bmw256_init( &l2v2_4way_ctx.bmw );
 }

 void lyra2rev2_4way_hash( void *state, const void *input )
@@ -45,7 +39,6 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );

   blake256_4way( &ctx.blake, input + (64<<2), 16 );
-//   blake256_4way( &ctx.blake, input, 80 );
   blake256_4way_close( &ctx.blake, vhash );

   mm256_reinterleave_4x64( vhash64, vhash, 256 );
@@ -54,11 +47,11 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );

   LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -71,36 +64,20 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   skein256_4way_close( &ctx.skein, vhash64 );
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );

-
-   // BMW256 4way has a lane corruption problem, only lanes 0 & 2 produce
-   // good hash. As a result this ugly workaround of running bmw256-4way
-   // twice with data shuffled to get all 4 lanes of good hash.
-   // The hash is then shuffled back into the appropriate lanes for output.
-   // Not as fast but still faster than using sph serially. 
-
-   // shift lane 1 data to lane 2.
-   mm_interleave_4x32( vhash, hash0, hash0, hash1, hash1, 256 );
+   mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, vhash );
-   uint32_t trash[8] __attribute__ ((aligned (32)));
-   // extract lane 0 as usual and lane2 containing lane 1 hash
-   mm_deinterleave_4x32( state, trash, state+32, trash, vhash, 256 );
-   // shift lane2 data to lane 0 and lane 3 data to lane 2
-   mm_interleave_4x32( vhash, hash2, hash2, hash3, hash3, 256 );
-   bmw256_4way_init( &ctx.bmw );
-   bmw256_4way( &ctx.bmw, vhash, 32 );
-   bmw256_4way_close( &ctx.bmw, vhash );
-   // extract lane 2 hash from lane 0 and lane 3 hash from lane 2.
-   mm_deinterleave_4x32( state+64, trash, state+96, trash, vhash, 256 );
+
+   mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -144,7 +121,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,

      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
      {
-//printf("found0\n");
          found[0] = true;
          num_found++;
          nonces[0] = pdata[19] = n;
@@ -152,7 +128,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
      }
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
      {
-//printf("found1\n");
          found[1] = true;
          num_found++;
          nonces[1] = n+1;
@@ -160,7 +135,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
      }
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
      {
-//printf("found2\n");
          found[2] = true;
          num_found++;
          nonces[2] = n+2;
@@ -168,7 +142,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
      }
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
      {
-//printf("found3\n");
          found[3] = true;
          num_found++;
          nonces[3] = n+3;