v3.8.2.1

2025-09-17 23:44:27 +00:00 · 2018-02-17 13:52:24 -05:00
parent d60a268972
commit 502ed0b1fe
21 changed files with 261 additions and 339 deletions
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -491,14 +491,14 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
    __m256i *buffer = (__m256i*)state->buffer;
    __m256i msg[2];
    int i;
-    int blocks = (int)len / 32;
-    state-> rembytes = (int)len % 32;
+    int blocks = (int)len >> 5;
+    state-> rembytes = (int)len & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
-       msg[0] = mm256_bswap_32( vdata[ i   ] );
-       msg[1] = mm256_bswap_32( vdata[ i+1 ] );
+       msg[0] = mm256_bswap_32( vdata[ 0] );
+       msg[1] = mm256_bswap_32( vdata[ 1 ] );
       rnd512_2way( state, msg );
    }

@@ -533,7 +533,7 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
    finalization512_2way( state, (uint32*)hashval );

    if ( state->hashbitlen > 512 )
-        finalization512_2way( state, (uint32*)( hashval+128 ) );
+        finalization512_2way( state, (uint32*)( hashval+32 ) );
    return 0;
 }

@@ -575,7 +575,7 @@ int luffa_2way_update_close( luffa_2way_context *state,

    finalization512_2way( state, (uint32*)output );
    if ( state->hashbitlen > 512 )
-        finalization512_2way( state, (uint32*)( output+128 ) );
+        finalization512_2way( state, (uint32*)( output+32 ) );

    return 0;
 }
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -1,5 +1,6 @@
 #include "allium-gate.h"
 #include <memory.h>
+#include <mm_malloc.h>

 #if defined (ALLIUM_4WAY)	

@@ -18,14 +19,15 @@ typedef struct {

 } allium_4way_ctx_holder;

-static allium_4way_ctx_holder allium_4way_ctx;
+static __thread allium_4way_ctx_holder allium_4way_ctx;

-void init_allium_4way_ctx()
+bool init_allium_4way_ctx()
 {
   keccak256_4way_init( &allium_4way_ctx.keccak );
   cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
   skein256_4way_init( &allium_4way_ctx.skein );
   init_groestl256( &allium_4way_ctx.groestl, 32 );
+   return true;
 }

 void allium_4way_hash( void *state, const void *input )
--- a/algo/lyra2/allium-gate.c
+++ b/algo/lyra2/allium-gate.c
@@ -5,11 +5,11 @@ int64_t get_max64_0xFFFFLL() { return 0xFFFFLL; }
 bool register_allium_algo( algo_gate_t* gate )
 {
 #if defined (ALLIUM_4WAY)
-  init_allium_4way_ctx();
+  gate->miner_thread_init = (void*)&init_allium_4way_ctx;
  gate->scanhash  = (void*)&scanhash_allium_4way;
  gate->hash      = (void*)&allium_4way_hash;
 #else
-  init_allium_ctx();
+  gate->miner_thread_init = (void*)&init_allium_ctx;
  gate->scanhash  = (void*)&scanhash_allium;
  gate->hash      = (void*)&allium_hash;
 #endif
--- a/algo/lyra2/allium-gate.h
+++ b/algo/lyra2/allium-gate.h
@@ -16,14 +16,14 @@ bool register_allium_algo( algo_gate_t* gate );
 void allium_4way_hash( void *state, const void *input );
 int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done );
-void init_allium_4way_ctx();
+bool init_allium_4way_ctx();

 #endif

 void allium_hash( void *state, const void *input );
 int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done );
-void init_allium_ctx();
+bool init_allium_ctx();

 #endif

--- a/algo/lyra2/allium.c
+++ b/algo/lyra2/allium.c
@@ -12,9 +12,9 @@
 #include "lyra2.h"

 typedef struct {
-        cubehashParam            cube;
        sph_blake256_context     blake;
        sph_keccak256_context    keccak;
+        cubehashParam            cube;
        sph_skein256_context     skein;
 #if defined (__AES__)
        hashState_groestl256     groestl;
@@ -23,9 +23,9 @@ typedef struct {
 #endif
 } allium_ctx_holder;

-static allium_ctx_holder allium_ctx;
+static __thread allium_ctx_holder allium_ctx;

-void init_allium_ctx()
+bool init_allium_ctx()
 {
        sph_keccak256_init( &allium_ctx.keccak );
        cubehashInit( &allium_ctx.cube, 256, 16, 32 );
@@ -35,6 +35,7 @@ void init_allium_ctx()
 #else
        sph_groestl256_init( &allium_ctx.groestl );
 #endif
+        return true;
 }

 void allium_hash(void *state, const void *input)
--- a/algo/lyra2/allium.c.broke
+++ b/algo/lyra2/allium.c.broke
@@ -1,123 +0,0 @@
-#include "allium-gate.h"
-#include <memory.h>
-#include "algo/blake/sph_blake.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h" 
-#if defined(__AES__)
-#include "algo/groestl/aes_ni/hash-groestl256.h"
-#else
-#include "algo/groestl/sph_groestl.h"
-#endif
-
-typedef struct {
-        cubehashParam            cube;
-        sph_blake256_context     blake;
-        sph_keccak256_context    keccak;
-        sph_skein256_context     skein;
-#if defined (__AES__)
-        hashState_groestl256     groestl;
-#else
-        sph_groestl256_context   groestl;
-#endif
-} allium_ctx_holder;
-
-static allium_ctx_holder allium_ctx;
-static __thread sph_blake256_context allium_blake_mid;
-
-void init_allium_ctx()
-{
-        cubehashInit( &allium_ctx.cube, 256, 16, 32 );
-        sph_blake256_init( &allium_ctx.blake );
-        sph_keccak256_init( &allium_ctx.keccak );
-        sph_skein256_init( &allium_ctx.skein );
-#if defined (__AES__)
-        init_groestl256( &allium_ctx.groestl, 32 );
-#else
-        sph_groestl256_init( &allium_ctx.groestl );
-#endif
-}
-
-void allium_blake256_midstate( const void* input )
-{
-    memcpy( &allium_blake_mid, &allium_ctx.blake, sizeof allium_blake_mid );
-    sph_blake256( &allium_blake_mid, input, 64 );
-}
-
-void allium_hash( void *state, const void *input )
-{
-   allium_ctx_holder ctx __attribute__ ((aligned (64))); 
-   memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
-   uint8_t hash[128] __attribute__ ((aligned (64)));
-   const int midlen = 64;            // bytes
-   const int tail   = 80 - midlen;   // 16
-
-   memcpy( &ctx.blake, &allium_blake_mid, sizeof allium_blake_mid );
-   sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
-   sph_blake256_close( &ctx.blake, hash );
-
-   sph_keccak256( &ctx.keccak, hash, 32 );
-   sph_keccak256_close(&ctx.keccak, hash);
-
-   LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
-//   LYRA2REV2( allium_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 8, 8 );
-
-   cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
-
-   LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
-//   LYRA2REV2( allium_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 8, 8 );
-
-   sph_skein256( &ctx.skein, hash, 32 );
-   sph_skein256_close( &ctx.skein, hash );
-
-#if defined (__AES__)
-   update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
-#else
-   sph_groestl256( &ctx.skein, hash, 32 );
-   sph_groestl256_close( &ctx.skein, hash );
-#endif
-
-   memcpy( state, hash, 32 );
-}
-
-int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done )
-{
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-	uint32_t endiandata[20] __attribute__ ((aligned (64)));
-        uint32_t hash[8] __attribute__((aligned(64)));
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
-        const uint32_t Htarg = ptarget[7];
-
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
-
-        swab32_array( endiandata, pdata, 20 );
-
-        allium_blake256_midstate( endiandata );
-
-	do {
-		be32enc(&endiandata[19], nonce);
-		allium_hash(hash, endiandata);
-
-		if (hash[7] <= Htarg )
-                {
-                   if( fulltest(hash, ptarget) )
-                   {
-			pdata[19] = nonce;
-                        work_set_target_ratio( work, hash );
-			*hashes_done = pdata[19] - first_nonce;
-		   	return 1;
-		   }
-                }
-		nonce++;
-
-	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
-}
-
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -68,13 +68,13 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   //Tries to allocate enough space for the whole memory matrix

   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   // for Lyra2REv2, nCols = 4, v1 was using 8
   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
   uint64_t *ptrWord = wholeMatrix;

-   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+//   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );

   //=== Getting the password + salt + basil padded with 10*1 ==========//
   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
@@ -232,9 +232,9 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
    //Tries to allocate enough space for the whole memory matrix

    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+//    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

-    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+//    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );

    //==== Getting the password + salt + basil padded with 10*1 ============//
    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
@@ -380,18 +380,17 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;

   i = (int64_t)ROW_LEN_BYTES * nRows;
-   uint64_t *wholeMatrix = _mm_malloc( i, 32 );
-//   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
+   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
   if (wholeMatrix == NULL)
      return -1;

-//#if defined (__AVX2__)
-//   memset_zero_m256i( (__m256i*)wholeMatrix, i<<5 );
-//#elif defined(__AVX__)
-//   memset_zero_m128i( (__m128i*)wholeMatrix, i<<4 );
-//#else
-   memset(wholeMatrix, 0, i);
-//#endif
+#if defined(__AVX2__)
+   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
+#elif defined(__AVX__)
+   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );   
+#else
+   memset( wholeMatrix, 0, i );
+#endif

   uint64_t *ptrWord = wholeMatrix;

@@ -413,8 +412,8 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
   memcpy(ptrByte, salt, saltlen);
   ptrByte += saltlen;

-   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
-                       - (saltlen + pwdlen) );
+//   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+//                       - (saltlen + pwdlen) );

   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
   memcpy(ptrByte, &kLen, sizeof(int64_t));
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -19,12 +19,13 @@ typedef struct {

 static lyra2v2_4way_ctx_holder l2v2_4way_ctx;

-void init_lyra2rev2_4way_ctx()
+bool init_lyra2rev2_4way_ctx()
 {
   keccak256_4way_init( &l2v2_4way_ctx.keccak );
   cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
   skein256_4way_init( &l2v2_4way_ctx.skein );
   bmw256_4way_init( &l2v2_4way_ctx.bmw );
+   return true;
 }

 void lyra2rev2_4way_hash( void *state, const void *input )
--- a/algo/lyra2/lyra2rev2-gate.c
+++ b/algo/lyra2/lyra2rev2-gate.c
@@ -14,18 +14,20 @@ bool lyra2rev2_thread_init()

   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   l2v2_wholeMatrix = _mm_malloc( i, 64 );
-
+#if defined (LYRA2REV2_4WAY)
+   init_lyra2rev2_4way_ctx();;
+#else
+   init_lyra2rev2_ctx();
+#endif
   return l2v2_wholeMatrix;
 }

 bool register_lyra2rev2_algo( algo_gate_t* gate )
 {
 #if defined (LYRA2REV2_4WAY)
-  init_lyra2rev2_4way_ctx();
  gate->scanhash  = (void*)&scanhash_lyra2rev2_4way;
  gate->hash      = (void*)&lyra2rev2_4way_hash;
 #else
-  init_lyra2rev2_ctx();
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
--- a/algo/lyra2/lyra2rev2-gate.h
+++ b/algo/lyra2/lyra2rev2-gate.h
@@ -20,7 +20,7 @@ void lyra2rev2_4way_hash( void *state, const void *input );
 int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );

-void init_lyra2rev2_4way_ctx();
+bool init_lyra2rev2_4way_ctx();

 #endif

@@ -29,7 +29,7 @@ void lyra2rev2_hash( void *state, const void *input );
 int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done );

-void init_lyra2rev2_ctx();
+bool init_lyra2rev2_ctx();

 #endif

--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -21,7 +21,7 @@ typedef struct {
 static lyra2v2_ctx_holder lyra2v2_ctx;
 static __thread sph_blake256_context l2v2_blake_mid;

-void init_lyra2rev2_ctx()
+bool init_lyra2rev2_ctx()
 {
        cubehashInit( &lyra2v2_ctx.cube1, 256, 16, 32 );
        cubehashInit( &lyra2v2_ctx.cube2, 256, 16, 32 );
@@ -29,6 +29,7 @@ void init_lyra2rev2_ctx()
        sph_keccak256_init( &lyra2v2_ctx.keccak );
        sph_skein256_init( &lyra2v2_ctx.skein );
        sph_bmw256_init( &lyra2v2_ctx.bmw );
+        return true;
 }

 void l2v2_blake256_midstate( const void* input )
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -42,7 +42,7 @@ inline void initState( uint64_t State[/*16*/] )
 {
 #if defined (__AVX2__)

-  __m256i* state = (__m256i*)State;
+  __m256i *state = (__m256i*)State;
 
  state[0] = _mm256_setzero_si256();
  state[1] = _mm256_setzero_si256();
@@ -53,7 +53,7 @@ inline void initState( uint64_t State[/*16*/] )

 #elif defined (__AVX__)

-  __m128i* state = (__m128i*)State;
+  __m128i *state = (__m128i*)State;

  state[0] = _mm_setzero_si128();
  state[1] = _mm_setzero_si128();
@@ -123,8 +123,8 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )

    const int len_m256i = len / 32;
    const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
-    __m256i* state = (__m256i*)State;
-    __m256i* out   = (__m256i*)Out;
+    __m256i *state = (__m256i*)State;
+    __m256i *out   = (__m256i*)Out;
    int i;

    //Squeezes full blocks
@@ -141,8 +141,8 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )

    const int len_m128i = len / 16;
    const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
-    __m128i* state = (__m128i*)State;
-    __m128i* out   = (__m128i*)Out;
+    __m128i *state = (__m128i*)State;
+    __m128i *out   = (__m128i*)Out;
    int i;

    //Squeezes full blocks
@@ -186,19 +186,27 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
 {
 #if defined (__AVX2__)

-    __m256i* state = (__m256i*)State;
-    __m256i* in = (__m256i*)In;
+    register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
+    register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
+    register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
+    register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
+    const __m256i *in = (const __m256i*)In;

-    state[0] = _mm256_xor_si256( state[0], in[0] );
-    state[1] = _mm256_xor_si256( state[1], in[1] );
-    state[2] = _mm256_xor_si256( state[2], in[2] );
+    state0 = _mm256_xor_si256( state0, in[0] );
+    state1 = _mm256_xor_si256( state1, in[1] );
+    state2 = _mm256_xor_si256( state2, in[2] );

-    LYRA_12_ROUNDS_AVX2( state[0], state[1], state[2], state[3] );
+    LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );
+
+    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
+    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
+    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
+    _mm256_store_si256( casto_m256i( State, 3 ), state3 );

 #elif defined (__AVX__)

-    __m128i* state = (__m128i*)State;
-    __m128i* in    = (__m128i*)In;
+    __m128i *state = (__m128i*)State;
+    const __m128i *in = (const __m128i*)In;

    state[0] = _mm_xor_si128( state[0], in[0] );
    state[1] = _mm_xor_si128( state[1], in[1] );
@@ -245,18 +253,26 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
    //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
 #if defined (__AVX2__)

-    __m256i* state = (__m256i*)State;
-    __m256i* in    = (__m256i*)In;
+    register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
+    register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
+    register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
+    register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
+    const __m256i *in = (const __m256i*)In;

-    state[0] = _mm256_xor_si256( state[0], in[0] );
-    state[1] = _mm256_xor_si256( state[1], in[1] );
+    state0 = _mm256_xor_si256( state0, in[0] );
+    state1 = _mm256_xor_si256( state1, in[1] );

-    LYRA_12_ROUNDS_AVX2( state[0], state[1], state[2], state[3] );
+    LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );
+
+    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
+    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
+    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
+    _mm256_store_si256( casto_m256i( State, 3 ), state3 );

 #elif defined (__AVX__)

-    __m128i* state = (__m128i*)State;
-    __m128i* in    = (__m128i*)In;
+    __m128i *state = (__m128i*)State;
+    const __m128i *in = (const __m128i*)In;

    state[0] = _mm_xor_si128( state[0], in[0] );
    state[1] = _mm_xor_si128( state[1], in[1] );
@@ -292,7 +308,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
 * @param state     The current state of the sponge
 * @param rowOut    Row to receive the data squeezed
 */
-inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
+inline void reducedSqueezeRow0( uint64_t *State, uint64_t *rowOut,
                                uint64_t nCols )
 {
    int i;
@@ -301,24 +317,19 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,

 #if defined (__AVX2__)

-    __m256i* state = (__m256i*)State;
-    __m256i  state0 = _mm256_load_si256(  state    );
-    __m256i  state1 = _mm256_load_si256( &state[1] );
-    __m256i  state2 = _mm256_load_si256( &state[2] );
-    __m256i  state3 = _mm256_load_si256( &state[3] );
+    register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
+    register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
+    register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
+    register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
+    __m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );

-    __m256i* out   = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
-
-    for ( i = 0; i < 9; i += 3)
-    {
-        _mm_prefetch( out - i,     _MM_HINT_T0 );
-        _mm_prefetch( out - i - 2, _MM_HINT_T0 );
-    }
+    __builtin_prefetch( out,    1, 0 );
+    __builtin_prefetch( out -2, 1, 0 );
+    __builtin_prefetch( out -4, 1, 0 );

    for ( i = 0; i < nCols; i++ )
    {
-       _mm_prefetch( out -  9, _MM_HINT_T0 );
-       _mm_prefetch( out - 11, _MM_HINT_T0 );
+       __builtin_prefetch( out -i-6, 1, 0 );
                   
       out[0] = state0;
       out[1] = state1;
@@ -330,15 +341,14 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
       LYRA_ROUND_AVX2( state0, state1, state2, state3 );
    }

-   _mm256_store_si256( state,     state0 );
-   _mm256_store_si256( &state[1], state1 );
-   _mm256_store_si256( &state[2], state2 );
-   _mm256_store_si256( &state[3], state3 );
-
+    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
+    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
+    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
+    _mm256_store_si256( casto_m256i( State, 3 ), state3 );

 #elif defined (__AVX__)

-    __m128i* state = (__m128i*)State;
+    __m128i *state = (__m128i*)State;
    __m128i  state0 = _mm_load_si128(  state    );
    __m128i  state1 = _mm_load_si128( &state[1] );
    __m128i  state2 = _mm_load_si128( &state[2] );
@@ -348,7 +358,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
    __m128i  state6 = _mm_load_si128( &state[6] );
    __m128i  state7 = _mm_load_si128( &state[7] );

-    __m128i* out   = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
+    __m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

    for ( i = 0; i < 6; i += 3)
    {
@@ -387,7 +397,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,

 #else

-    uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
+    uint64_t *ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]

    for ( i = 0; i < nCols; i++ )
    {
@@ -422,37 +432,31 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
 * @param rowIn		Row to feed the sponge
 * @param rowOut	Row to receive the sponge's output
 */
-inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
+inline void reducedDuplexRow1( uint64_t *State, const uint64_t *rowIn,
                               uint64_t *rowOut, uint64_t nCols )
 {
    int i;

 #if defined (__AVX2__)

-    __m256i* state = (__m256i*)State;
-    __m256i  state0 = _mm256_load_si256(  state    );
-    __m256i  state1 = _mm256_load_si256( &state[1] );
-    __m256i  state2 = _mm256_load_si256( &state[2] );
-    __m256i  state3 = _mm256_load_si256( &state[3] );
+    register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
+    register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
+    register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
+    register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
+    const __m256i *in = (const __m256i*)rowIn;
+    __m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );

-    __m256i* in    = (__m256i*)rowIn;
-    __m256i* out   = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
-
-    for ( i = 0; i < 9; i += 3)
-    {
-        _mm_prefetch( in  + i,     _MM_HINT_T0 );
-        _mm_prefetch( in  + i + 2, _MM_HINT_T0 );
-        _mm_prefetch( out - i,     _MM_HINT_T0 );
-        _mm_prefetch( out - i - 2, _MM_HINT_T0 );
-    }
+    __builtin_prefetch( in,     0, 0 );
+    __builtin_prefetch( in  +2, 0, 0 );
+    __builtin_prefetch( in  +4, 0, 0 );
+    __builtin_prefetch( out,    1, 0 );
+    __builtin_prefetch( out -2, 1, 0 );
+    __builtin_prefetch( out -4, 1, 0 );

    for ( i = 0; i < nCols; i++ )
    {
-
-        _mm_prefetch( in  +  9, _MM_HINT_T0 );
-        _mm_prefetch( in  + 11, _MM_HINT_T0 );
-        _mm_prefetch( out -  9, _MM_HINT_T0 );
-        _mm_prefetch( out - 11, _MM_HINT_T0 );
+         __builtin_prefetch( in  +i+6, 0, 0 );
+         __builtin_prefetch( out -i-6, 1, 0 );
 
         state0 = _mm256_xor_si256( state0, in[0] );
         state1 = _mm256_xor_si256( state1, in[1] );
@@ -470,14 +474,14 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
         out -= BLOCK_LEN_M256I;
    }

-   _mm256_store_si256( state,     state0 );
-   _mm256_store_si256( &state[1], state1 );
-   _mm256_store_si256( &state[2], state2 );
-   _mm256_store_si256( &state[3], state3 );
+    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
+    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
+    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
+    _mm256_store_si256( casto_m256i( State, 3 ), state3 );

 #elif defined (__AVX__)

-    __m128i* state = (__m128i*)State;
+    __m128i *state = (__m128i*)State;
    __m128i  state0 = _mm_load_si128(  state    );
    __m128i  state1 = _mm_load_si128( &state[1] );
    __m128i  state2 = _mm_load_si128( &state[2] );
@@ -487,8 +491,8 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
    __m128i  state6 = _mm_load_si128( &state[6] );
    __m128i  state7 = _mm_load_si128( &state[7] );

-    __m128i*  in   = (__m128i*)rowIn;
-    __m128i* out   = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
+    const __m128i *in = (const __m128i*)rowIn;
+    __m128i *out = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

    for ( i = 0; i < 6; i += 3)
    {
@@ -540,8 +544,8 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,

 #else

-    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
-    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+    const uint64_t *ptrWordIn = (const uint64_t*)rowIn;        //In Lyra2: pointer to prev
+    uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++ )
    {
@@ -600,7 +604,7 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
 * @param rowOut         Row receiving the output
 *
 */
-inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
+inline void reducedDuplexRowSetup( uint64_t *State, const uint64_t *rowIn,
                                   uint64_t *rowInOut, uint64_t *rowOut,
                                   uint64_t nCols )
 {
@@ -608,35 +612,30 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,

 #if defined (__AVX2__)

-    __m256i* state = (__m256i*)State;
-    __m256i  state0 = _mm256_load_si256(  state    );
-    __m256i  state1 = _mm256_load_si256( &state[1] );
-    __m256i  state2 = _mm256_load_si256( &state[2] );
-    __m256i  state3 = _mm256_load_si256( &state[3] );
+    register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
+    register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
+    register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
+    register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
+    const __m256i *in = (const __m256i*)rowIn;
+    __m256i *inout = (__m256i*)rowInOut;
+    __m256i *out = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
+    __m256i t0, t1, t2;

-    __m256i* in    = (__m256i*)rowIn;
-    __m256i* inout = (__m256i*)rowInOut;
-    __m256i* out   = (__m256i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
-    __m256i  t0, t1, t2;
-
-    for ( i = 0; i < 9; i += 3)
-    {
-        _mm_prefetch( in    + i,     _MM_HINT_T0 );
-        _mm_prefetch( in    + i + 2, _MM_HINT_T0 );
-        _mm_prefetch( inout + i,     _MM_HINT_T0 );
-        _mm_prefetch( inout + i + 2, _MM_HINT_T0 );
-        _mm_prefetch( out   - i,     _MM_HINT_T0 );
-        _mm_prefetch( out   - i - 2, _MM_HINT_T0 );
-    }
+    __builtin_prefetch( in,       0, 0 );
+    __builtin_prefetch( in    +2, 0, 0 );
+    __builtin_prefetch( in    +4, 0, 0 );
+    __builtin_prefetch( inout,    1, 0 );
+    __builtin_prefetch( inout +2, 1, 0 );
+    __builtin_prefetch( inout +4, 1, 0 );
+    __builtin_prefetch( out,      1, 0 );
+    __builtin_prefetch( out   -2, 1, 0 );
+    __builtin_prefetch( out   -4, 1, 0 );

    for ( i = 0; i < nCols; i++ )
    {
-       _mm_prefetch( in    +  9, _MM_HINT_T0 );
-       _mm_prefetch( in    + 11, _MM_HINT_T0 );
-       _mm_prefetch( inout +  9, _MM_HINT_T0 );
-       _mm_prefetch( inout + 11, _MM_HINT_T0 );
-       _mm_prefetch( out   -  9, _MM_HINT_T0 );
-       _mm_prefetch( out   - 11, _MM_HINT_T0 );
+       __builtin_prefetch( in    +i+6, 0, 0 );
+       __builtin_prefetch( inout +i+6, 1, 0 );
+       __builtin_prefetch( out   -i-6, 1, 0 );

       state0 = _mm256_xor_si256( state0,
                                  _mm256_add_epi64( in[0], inout[0] ) );
@@ -670,16 +669,16 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
       out   -= BLOCK_LEN_M256I;
    }

-   _mm256_store_si256( state,     state0 );
-   _mm256_store_si256( &state[1], state1 );
-   _mm256_store_si256( &state[2], state2 );
-   _mm256_store_si256( &state[3], state3 );
+    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
+    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
+    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
+    _mm256_store_si256( casto_m256i( State, 3 ), state3 );

 #elif defined (__AVX__)

-    __m128i* in    = (__m128i*)rowIn;
-    __m128i* inout = (__m128i*)rowInOut;
-    __m128i* out   = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );
+    const __m128i *in = (const __m128i*)rowIn;
+    __m128i *inout = (__m128i*)rowInOut;
+    __m128i *out   = (__m128i*)rowOut + ( (nCols-1) * BLOCK_LEN_M128I );

    for ( i = 0; i < 6; i += 3)
    {
@@ -691,12 +690,12 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
        _mm_prefetch( out   - i - 2, _MM_HINT_T0 );
    }

-    __m128i* state = (__m128i*)State;
+    __m128i *state = (__m128i*)State;

    // For the last round in this function not optimized for AVX
-    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
-    uint64_t* ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
-    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+    const uint64_t *ptrWordIn = rowIn;        //In Lyra2: pointer to prev
+    uint64_t *ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
+    uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++ )
    {
@@ -757,9 +756,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,

 #else

-    uint64_t* ptrWordIn = rowIn;        //In Lyra2: pointer to prev
-    uint64_t* ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
-    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
+    const uint64_t *ptrWordIn = (const uint64_t*)rowIn;        //In Lyra2: pointer to prev
+    uint64_t *ptrWordInOut = rowInOut;  //In Lyra2: pointer to row*
+    uint64_t *ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++ )
    {
@@ -834,7 +833,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
 *
 */

-inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
+inline void reducedDuplexRow( uint64_t *State, const uint64_t *rowIn,
                              uint64_t *rowInOut, uint64_t *rowOut,
                              uint64_t nCols )
 {
@@ -842,35 +841,30 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,

 #if defined __AVX2__

-   __m256i* state = (__m256i*)State;
-   __m256i  state0 = _mm256_load_si256(  state    );
-   __m256i  state1 = _mm256_load_si256( &state[1] );
-   __m256i  state2 = _mm256_load_si256( &state[2] );
-   __m256i  state3 = _mm256_load_si256( &state[3] );
+   register __m256i state0 = _mm256_load_si256( casto_m256i( State, 0 ) );
+   register __m256i state1 = _mm256_load_si256( casto_m256i( State, 1 ) );
+   register __m256i state2 = _mm256_load_si256( casto_m256i( State, 2 ) );
+   register __m256i state3 = _mm256_load_si256( casto_m256i( State, 3 ) );
+   const __m256i* in    = (const __m256i*)rowIn;
+   __m256i *inout = (__m256i*)rowInOut;
+   __m256i *out = (__m256i*)rowOut;
+   __m256i t0, t1, t2;

-   __m256i* in    = (__m256i*)rowIn;
-   __m256i* inout = (__m256i*)rowInOut;
-   __m256i* out   = (__m256i*)rowOut;
-   __m256i  t0, t1, t2;
-
-   for ( i = 0; i < 9; i += 3)
-   {
-       _mm_prefetch( in    + i,     _MM_HINT_T0 );
-       _mm_prefetch( in    + i + 2, _MM_HINT_T0 );
-       _mm_prefetch( out   + i,     _MM_HINT_T0 );
-       _mm_prefetch( out   + i + 2, _MM_HINT_T0 );
-       _mm_prefetch( inout + i,     _MM_HINT_T0 );
-       _mm_prefetch( inout + i + 2, _MM_HINT_T0 );
-   }
+   __builtin_prefetch( in,        0, 0 );
+   __builtin_prefetch( in     +2, 0, 0 );
+   __builtin_prefetch( in     +4, 0, 0 );
+   __builtin_prefetch( inout,     1, 0 );
+   __builtin_prefetch( inout  +2, 1, 0 );
+   __builtin_prefetch( inout  +4, 1, 0 );
+   __builtin_prefetch( out,       1, 0 );
+   __builtin_prefetch( out    +2, 1, 0 );
+   __builtin_prefetch( out    +4, 1, 0 );

   for ( i = 0; i < nCols; i++ )
   {
-      _mm_prefetch( in    +  9, _MM_HINT_T0 );
-      _mm_prefetch( in    + 11, _MM_HINT_T0 );
-      _mm_prefetch( out   +  9, _MM_HINT_T0 );
-      _mm_prefetch( out   + 11, _MM_HINT_T0 );
-      _mm_prefetch( inout +  9, _MM_HINT_T0 );
-      _mm_prefetch( inout + 11, _MM_HINT_T0 );
+      __builtin_prefetch( in    +i+6, 0, 0 );
+      __builtin_prefetch( inout +i+6, 1, 0 );
+      __builtin_prefetch( out   +i+6, 1, 0 );

      //Absorbing "M[prev] [+] M[row*]"
      state0 = _mm256_xor_si256( state0,
@@ -906,17 +900,17 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
       inout += BLOCK_LEN_M256I;
   }

-   _mm256_store_si256( state,     state0 );
-   _mm256_store_si256( &state[1], state1 );
-   _mm256_store_si256( &state[2], state2 );
-   _mm256_store_si256( &state[3], state3 );
+    _mm256_store_si256( casto_m256i( State, 0 ), state0 );
+    _mm256_store_si256( casto_m256i( State, 1 ), state1 );
+    _mm256_store_si256( casto_m256i( State, 2 ), state2 );
+    _mm256_store_si256( casto_m256i( State, 3 ), state3 );

 #elif defined __AVX__

-    __m128i* state = (__m128i*)State;
-    __m128i* in    = (__m128i*)rowIn;
-    __m128i* inout = (__m128i*)rowInOut;
-    __m128i* out   = (__m128i*)rowOut;
+    __m128i *state = (__m128i*)State;
+    const __m128i *in = (const __m128i*)rowIn;
+    __m128i *inout = (__m128i*)rowInOut;
+    __m128i *out   = (__m128i*)rowOut;

    for ( i = 0; i < 6; i += 3)
    {
@@ -929,9 +923,9 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
    }

    // for the last round in this function that isn't optimized for AVX
-    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
-    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
-    uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
+    uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
+    const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
+    uint64_t *ptrWordOut = rowOut; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++)
    {
@@ -997,9 +991,9 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,

 #else

-    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
-    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
-    uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
+    uint64_t *ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
+    const uint64_t *ptrWordIn = (const uint64_t*)rowIn; //In Lyra2: pointer to prev
+    uint64_t *ptrWordOut = rowOut; //In Lyra2: pointer to row

    for ( i = 0; i < nCols; i++)
    {
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -159,23 +159,26 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){


 //---- Housekeeping
-void initState(uint64_t state[/*16*/]);
+void initState( uint64_t state[/*16*/] );

 //---- Squeezes
-void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
-void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);
+void squeeze( uint64_t *state, unsigned char *out, unsigned int len );
+void reducedSqueezeRow0( uint64_t* state, uint64_t* row, uint64_t nCols );

 //---- Absorbs
-void absorbBlock(uint64_t *state, const uint64_t *in);
-void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
+void absorbBlock( uint64_t *state, const uint64_t *in );
+void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in );

 //---- Duplexes
-void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
-void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
-void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRow1( uint64_t *state, const uint64_t *rowIn,
+                        uint64_t *rowOut, uint64_t nCols);
+void reducedDuplexRowSetup( uint64_t *state, const uint64_t *rowIn,
+                     uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
+void reducedDuplexRow( uint64_t *state, const uint64_t *rowIn,
+                     uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );

 //---- Misc
-void printArray(unsigned char *array, unsigned int size, char *name);
+//void printArray(unsigned char *array, unsigned int size, char *name);

 ////////////////////////////////////////////////////////////////////////////////////////////////

--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -25,7 +25,6 @@ qubit_2way_ctx_holder qubit_2way_ctx;

 void init_qubit_2way_ctx()
 {
-        luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
        cubehashInit(&qubit_2way_ctx.cube,512,16,32);
        sph_shavite512_init(&qubit_2way_ctx.shavite);
        simd_2way_init( &qubit_2way_ctx.simd, 512 );