v3.18.1

2026-02-22 16:33:08 +00:00 · 2021-10-10 22:50:19 -04:00
parent 2cd1507c2e
commit 47cc5dcff5
14 changed files with 2057 additions and 2827 deletions
--- a/22
+++ b/22
@@ -32,14 +32,26 @@ but different package names.
 $ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
-openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
+openssl 1.1.0e or higher.
 support depending on your CPU and compiler version:
-"-march=native" is always the best choice
+znver1 and znver2 should be recognized on most recent version of GCC and
 znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
 In the meantime here are some suggestions to compile with new CPUs:
-"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
+"-march=native" is usually the best choice, used by build.sh.
-"-msha"  Add SHA to other tuning options
+"-march=znver2 -mvaes" can be used for Ryzen 5000 if znver3 is not recongized.
 "-mcascadelake -msha" or
 "-mcometlake -mavx512 -msha" can be used for Rocket Lake.
 Features can also be added individually:
 "-msha" adds support for HW accelerated sha256.
 "-mavx512" adds support for 512 bit vectors
 "-mvaes" add support for parallel AES
 Additional instructions for static compilalation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
--- a/16
+++ b/16
@@ -65,10 +65,24 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 v3.18.1
 More speed for scrypt:
 - additional scryptn2 optimizations for all CPU architectures,
 - AVX2 is now used by default on CPUS with SHA but not AVX512,
 - scrypt:1024 performance lost in v3.18.0 is restored,
 - AVX512 & AVX2 improvements to scrypt:1024.
 Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
 Issue #337: fixed a problem that could display negative stats values in the
 first summary report if the report was forced prematurely due to a stratum
 diff change. The stats will still be invalid but should display zeros.
 v3.18.0
 Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
-  - AVX512 & SHA support for SHA256, AVX512 has priority,
+  - AVX512 & SHA support for sha256, AVX512 has priority,
  - up to 50% increase in hashrate,
  - memory requirements reduced 30-60% depending on CPU architecture,
  - memory usage displayed at startup,
--- a/algo/scrypt/scrypt-core-4way.c
+++ b/algo/scrypt/scrypt-core-4way.c
--- a/algo/scrypt/scrypt.c
+++ b/algo/scrypt/scrypt.c
@@ -146,6 +146,119 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
      output[i] = bswap_32( ostate[i] );
 }
 #if defined(__SHA__)
 static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0, 
                    const uint32_t *key1, uint32_t *tstate0, uint32_t *tstate1,
                    uint32_t *ostate0, uint32_t *ostate1 )
 {
   uint32_t ihash0[8], ihash1[8], pad0[16], pad1[16];
   int i;
   memcpy( pad0, key0 + 16, 16 );
   memcpy( pad0 + 4, keypad, 48 );
   memcpy( pad1, key1 + 16, 16 );
   memcpy( pad1 + 4, keypad, 48 );
   sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
 		               tstate0, tstate1 );
   memcpy( ihash0, tstate0, 32 );
   memcpy( ihash1, tstate1, 32 );
   for ( i = 0; i < 8; i++ )
   {
      pad0[i] = ihash0[i] ^ 0x5c5c5c5c;
      pad1[i] = ihash1[i] ^ 0x5c5c5c5c;
   }
   for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c;
   sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1,
                               sha256_initial_state, sha256_initial_state );
   for ( i = 0; i < 8; i++ )
   {
      pad0[i] = ihash0[i] ^ 0x36363636;
      pad1[i] = ihash1[i] ^ 0x36363636;
   }
   for ( ; i < 16; i++ )      pad0[i] = pad1[i] = 0x36363636;
   sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, 
                               sha256_initial_state, sha256_initial_state );
 }
 static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0,
            const uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1,
            const uint32_t *salt0, const uint32_t *salt1, uint32_t *output0,
            uint32_t *output1 )
 {
   uint32_t istate0[8], istate1[8], ostateb0[8], ostateb1[8];
   uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16];
   int i, j;
   sha256_ni2way_transform_le( istate0, istate1, salt0, salt1,
                               tstate0, tstate1 );
   memcpy( ibuf0, salt0 + 16, 16 );
   memcpy( ibuf0 + 5, innerpad, 44 );
   memcpy( obuf0 + 8, outerpad, 32 );
   memcpy( ibuf1, salt1 + 16, 16 );
   memcpy( ibuf1 + 5, innerpad, 44 );
   memcpy( obuf1 + 8, outerpad, 32 );
   for ( i = 0; i < 4; i++ )
   {
      memcpy( obuf0, istate0, 32 );
      memcpy( obuf1, istate1, 32 );
      ibuf0[4] = ibuf1[4] = i + 1;
      sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1,
                                  obuf0, obuf1 );
      sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1,
                                  ostate0, ostate1 );
      for ( j = 0; j < 8; j++ )
      {
         output0[ 8*i + j ] = bswap_32( ostateb0[j] );
         output1[ 8*i + j ] = bswap_32( ostateb1[j] );
      }
   }
 }
 static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
                    uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1,
                    const uint32_t *salt0, const uint32_t *salt1,
                    uint32_t *output0, uint32_t *output1 )
 {
   uint32_t buf0[16], buf1[16];
   int i;
   sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1,
                               tstate0, tstate1 );   
   sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16,
                               tstate0, tstate1 );
   sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk,
                               tstate0, tstate1 );
   memcpy( buf0, tstate0, 32 );
   memcpy( buf0 + 8, outerpad, 32 );
   memcpy( buf1, tstate1, 32 );
   memcpy( buf1 + 8, outerpad, 32 );
   sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1,
                               ostate0, ostate1 );
   for ( i = 0; i < 8; i++ )
   {
      output0[i] = bswap_32( ostate0[i] );
      output1[i] = bswap_32( ostate1[i] );
   }
 }
 #endif
 #ifdef HAVE_SHA256_4WAY
 static const uint32_t keypad_4way[4 * 12] = {
@@ -643,10 +756,10 @@ static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output,
 static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
           uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
 {
-   uint32_t _ALIGN(128) tstate[8 * 8];
+   uint32_t _ALIGN(128) tstate[ 8*8 ];
-   uint32_t _ALIGN(128) ostate[8 * 8];
+   uint32_t _ALIGN(128) ostate[ 8*8 ];
-   uint32_t _ALIGN(128) W[8 * 32];
+   uint32_t _ALIGN(128) W[ 8*32 ];
-   uint32_t _ALIGN(128) X[8 * 32];
+   uint32_t _ALIGN(128) X[ 8*32 ];
   uint32_t *V = (uint32_t*)scratchpad;
   intrlv_8x32( W, input,    input+ 20, input+ 40, input+ 60,
@@ -658,53 +771,45 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
   PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W );
   dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 );
   if ( opt_param_n > 0x4000 )
   {
      scrypt_core_simd128_3buf( X,     V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_simd128_3buf( X+ 96, V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_simd128_2buf( X+192, V, N );
   }
   else
   {
      intrlv_2x128( W,     X,     X+ 32, 1024 );
      intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
      intrlv_2x128( W+128, X+128, X+160, 1024 );
      intrlv_2x128( W+192, X+192, X+224, 1024 );
      scrypt_core_2way_simd128( (__m256i*) W,      (__m256i*)V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
      dintrlv_2x128( X,     X+ 32, W,     1024 );
      dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
      dintrlv_2x128( X+128, X+160, W+128, 1024 );
      dintrlv_2x128( X+192, X+224, W+192, 1024 );
   }
   // SCRYPT CORE
   // AVX512
 /*
   // AVX512 16 way working
   intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224,
                    X+256, X+256+32, X+256+64, X+256+96, X+256+128,
                    X+256+160, X+256+192, X+256+224, 1024 );
   scrypt_core_16way( (__m512i*)W , (__m512i*)V, N );
   dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224,
                  X+256, X+256+32, X+256+64, X+256+96, X+256+128, 
                  X+256+160, X+256+192, X+256+224, W, 1024 );
 */
 /*
   // AVX512 working
   intrlv_4x32( W,     X,     X+ 32, X+ 64, X+ 96, 1024 );
   intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 );
   scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); 
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N );
   dintrlv_4x32( X,     X+ 32, X+ 64, X+ 96, W,     1024 );
   dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
 */   
 /*
   // AVX512, not working, very slow
   intrlv_4x128( W,     X,     X+ 32, X+ 64, X+ 96, 1024 );
   intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
   scrypt_core_4way_simd128( (__m512i*)W,      (__m512i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
   dintrlv_4x128( X,     X+ 32, X+ 64, X+ 96, W,     1024 );
   dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
 */
  // AVX2
-/*
+
   // AVX2   
   // disable de/interleave for testing.
-   scrypt_core_8way( (__m256i*)W , (__m256i*)V, N );
+//   scrypt_core_8way( (__m256i*)W , (__m256i*)V, N );
-*/
+
 /*
   // AVX2 working
@@ -714,23 +819,18 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
   intrlv_2x128( W+192, X+192, X+224, 1024 );
   // working
-//   scrypt_core_2way_simd128_3buf( (__m256i*) W,      (__m256i*)V, N );
+//   scrypt_core_2way_simd128_2buf( (__m256i*) W,      (__m256i*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
-//   scrypt_core_2way_simd128( (__m256i*)(W+192),      (__m256i*)V, N );
+//   scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
   // working
-   scrypt_core_2way_simd128_2buf( (__m256i*) W,      (__m256i*)V, N );
+   scrypt_core_2way_simd128( (__m256i*) W,      (__m256i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
-   scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
+   scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
-
+   if ( work_restart[thrid].restart ) return 0;
-   // working
+   scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
-//   scrypt_core_2way_simd128( (__m256i*) W,      (__m256i*)V, N );
+   if ( work_restart[thrid].restart ) return 0;
-//   if ( work_restart[thrid].restart ) return 0;
+   scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
 //   scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
   dintrlv_2x128( X,     X+ 32, W,     1024 );
   dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
@@ -745,18 +845,10 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
   intrlv_2x32( W+128, X+128, X+160, 1024 );
   intrlv_2x32( W+192, X+192, X+224, 1024 );
   // working, deprecated, not up to data
 //   scrypt_core_simd128_2way_4buf( (uint64_t*)W,  (uint64_t*)V, N );
     // deprecated, not up to date
 //   scrypt_core_simd128_2way_3buf( (uint64_t*)  W,       (uint64_t*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_simd128_2way(      (uint64_t*)( W+192 ), (uint64_t*)V, N );
   // working
-//   scrypt_core_simd128_2way_2buf( (uint64_t*)  W,       (uint64_t*)V, N );
+   scrypt_core_simd128_2way_2buf( (uint64_t*)  W,       (uint64_t*)V, N );
-//   if ( work_restart[thrid].restart ) return 0;
+   if ( work_restart[thrid].restart ) return 0;
-//   scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
+   scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
 //   scrypt_core_simd128_2way( (uint64_t*)  W,       (uint64_t*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
@@ -813,19 +905,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_2buf( X+192, V, N );
 */
-
+/**************
   scrypt_core_simd128_3buf( X,     V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_3buf( X+ 96, V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_2buf( X+192, V, N );
-
+*************/
 /*
   // SSE2 working
   scrypt_core_simd128_4buf( X,     V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_4buf( X+128, V, N );
 */
   if ( work_restart[thrid].restart ) return 0;
@@ -868,6 +954,39 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
                  W, 1024 );
   if ( opt_param_n > 0x4000 )
   {
      scrypt_core_simd128_3buf( X,     V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_simd128_3buf( X+ 96, V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_simd128_2buf( X+192, V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_simd128_3buf( X+256, V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_simd128_3buf( X+352, V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_simd128_2buf( X+448, V, N );
   }
   else
   {
      intrlv_4x128( W,     X,     X+ 32, X+ 64, X+ 96, 1024 );
      intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
      intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 );
      intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 );
      scrypt_core_4way_simd128( (__m512i*) W,      (__m512i*)V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N );
      dintrlv_4x128( X,     X+ 32, X+ 64, X+ 96, W,     1024 );
      dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
      dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 );
      dintrlv_4x128( X+384, X+416, X+448, X+480, W+384, 1024 );
   }
   // SCRYPT CORE
@@ -888,23 +1007,40 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
   // AVX512 working
   intrlv_4x32( W,     X,     X+ 32, X+ 64, X+ 96, 1024 );
   intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 );
   intrlv_4x32( W+256,     X+256,     X+256+ 32, X+256+ 64, X+256+ 96, 1024 );
   intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 );
   scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N );
   dintrlv_4x32( X,     X+ 32, X+ 64, X+ 96, W,     1024 );
   dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
   dintrlv_4x32( X+256,     X+256+ 32, X+256+ 64, X+256+ 96, W+256,     1024 );
   dintrlv_4x32( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 );
 */
 /*
-   // AVX512, not working, very slow
+   // AVX512, working
   intrlv_4x128( W,     X,     X+ 32, X+ 64, X+ 96, 1024 );
   intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
   intrlv_4x128( W+256,     X+256,     X+256+ 32, X+256+ 64, X+256+ 96, 1024 );
   intrlv_4x128( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 );
   scrypt_core_4way_simd128( (__m512i*)W,      (__m512i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_4way_simd128( (__m512i*)(W+256),   (__m512i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_4way_simd128( (__m512i*)(W+256+128), (__m512i*)V, N );
   dintrlv_4x128( X,     X+ 32, X+ 64, X+ 96, W,     1024 );
   dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
   dintrlv_4x128( X+256,     X+256+ 32, X+256+ 64, X+256+ 96, W+256,     1024 );
   dintrlv_4x128( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 );
 */
  // AVX2
 /*
@@ -919,16 +1055,19 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
   intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
   intrlv_2x128( W+128, X+128, X+160, 1024 );
   intrlv_2x128( W+192, X+192, X+224, 1024 );
-
+   intrlv_2x128( W+256,     X+256,     X+256+ 32, 1024 );
-   // working
+   intrlv_2x128( W+256+ 64, X+256+ 64, X+256+ 96, 1024 );
-//   scrypt_core_2way_simd128_3buf( (__m256i*) W,      (__m256i*)V, N );
+   intrlv_2x128( W+256+128, X+256+128, X+256+160, 1024 );
-//   if ( work_restart[thrid].restart ) return 0;
+   intrlv_2x128( W+256+192, X+256+192, X+256+224, 1024 );
 //   scrypt_core_2way_simd128( (__m256i*)(W+192),      (__m256i*)V, N );
   // working
   scrypt_core_2way_simd128_2buf( (__m256i*) W,      (__m256i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_2way_simd128_2buf( (__m256i*)(W+256),      (__m256i*)V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_2way_simd128_2buf( (__m256i*)(W+256+128), (__m256i*)V, N );
   // working
 //   scrypt_core_2way_simd128( (__m256i*) W,      (__m256i*)V, N );
@@ -938,11 +1077,23 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
 //   scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_2way_simd128( (__m256i*)(W+256),      (__m256i*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_2way_simd128( (__m256i*)(W+256+ 64), (__m256i*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_2way_simd128( (__m256i*)(W+256+128), (__m256i*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_2way_simd128( (__m256i*)(W+256+192), (__m256i*)V, N );
   dintrlv_2x128( X,     X+ 32, W,     1024 );
   dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
   dintrlv_2x128( X+128, X+160, W+128, 1024 );
   dintrlv_2x128( X+192, X+224, W+192, 1024 );
   dintrlv_2x128( X+256,     X+256+ 32, W+256,     1024 );
   dintrlv_2x128( X+256+ 64, X+256+ 96, W+256+ 64, 1024 );
   dintrlv_2x128( X+256+128, X+256+160, W+256+128, 1024 );
   dintrlv_2x128( X+256+192, X+256+224, W+256+192, 1024 );
 */
 /*
@@ -952,18 +1103,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
   intrlv_2x32( W+128, X+128, X+160, 1024 );
   intrlv_2x32( W+192, X+192, X+224, 1024 );
   // working, deprecated, not up to data
 //   scrypt_core_simd128_2way_4buf( (uint64_t*)W,  (uint64_t*)V, N );
     // deprecated, not up to date
 //   scrypt_core_simd128_2way_3buf( (uint64_t*)  W,       (uint64_t*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_simd128_2way(      (uint64_t*)( W+192 ), (uint64_t*)V, N );
   // working
 //   scrypt_core_simd128_2way_2buf( (uint64_t*)  W,       (uint64_t*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
 //   scrypt_core_simd128_2way_2buf( (uint64_t*)  W,       (uint64_t*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
 //   scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
 //   scrypt_core_simd128_2way( (uint64_t*)  W,       (uint64_t*)V, N );
 //   if ( work_restart[thrid].restart ) return 0;
@@ -1043,7 +1189,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_2buf( X+448, V, N );
 */
-
+/***************
   scrypt_core_simd128_3buf( X,     V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_3buf( X+ 96, V, N );
@@ -1055,17 +1201,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
   scrypt_core_simd128_3buf( X+352, V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_2buf( X+448, V, N );
-
+********************/
 /*
   // SSE2 working
   scrypt_core_simd128_4buf( X,     V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_4buf( X+128, V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_4buf( X+256, V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_4buf( X+384, V, N );
 */
 /*
   scrypt_core_3way( X,     V, N );
   if ( work_restart[thrid].restart ) return 0;
@@ -1102,6 +1238,31 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
 #if defined(__SHA__)
 static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
           uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
 {
    uint32_t _ALIGN(128) tstate[ 2*8 ];
    uint32_t _ALIGN(128) ostate[ 2*8 ];
    uint32_t _ALIGN(128) W[ 2*32 ];
    uint32_t *V = (uint32_t*)scratchpad;
    memcpy( tstate,    midstate, 32 );
    memcpy( tstate+ 8, midstate, 32 );
    HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8,
                                  ostate, ostate+8 );
    PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
                                   input, input+20,  W, W+32 );
    scrypt_core_simd128_2buf( W, V, N );
    if ( work_restart[thrid].restart ) return 0;
    PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
                                   output, output+8 );
   return 1;
 }
 static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
           uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
 {
@@ -1149,8 +1310,6 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
   scrypt_core_simd128( W+96, V, N );
 */
   // working
 //   scrypt_core_simd128_4buf( W, V, N );
   if ( work_restart[thrid].restart ) return 0;
@@ -1171,10 +1330,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
 static int scrypt_N_1_1_256_4way( const uint32_t *input,	uint32_t *output,
           uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
 {
-   uint32_t _ALIGN(128) tstate[4 * 8];
+   uint32_t _ALIGN(128) tstate[ 4*8 ];
-   uint32_t _ALIGN(128) ostate[4 * 8];
+   uint32_t _ALIGN(128) ostate[ 4*8 ];
-   uint32_t _ALIGN(128) W[4 * 32];
+   uint32_t _ALIGN(128) W[ 4*32 ];
   uint32_t _ALIGN(128) X[4 * 32];
   uint32_t *V = (uint32_t*)scratchpad;
   intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
@@ -1184,7 +1342,21 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input,	uint32_t *output,
   HMAC_SHA256_80_init_4way(W, tstate, ostate);
   PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
-   dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
+   if ( opt_param_n > 0x4000 )
   {
      uint32_t _ALIGN(128) X[ 4*32 ];
      dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
      scrypt_core_simd128_2buf( X, V, N );
      if ( work_restart[thrid].restart ) return 0;
      scrypt_core_simd128_2buf( X+64, V, N );
      intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
   }
   else
      scrypt_core_4way( (__m128i*)W, (__m128i*)V, N );
 //   dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
 ////// SCRYPT_CORE   
@@ -1202,35 +1374,23 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input,	uint32_t *output,
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128( X+96, V, N );
 */
-   
+/*   
   // working, double buffered linear simd, best for n2
   scrypt_core_simd128_2buf( X, V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128_2buf( X+64, V, N );
-  
+*/  
 /*
   scrypt_core_simd128_3buf( X, V, N );
   if ( work_restart[thrid].restart ) return 0;
   scrypt_core_simd128( X+96, V, N );
 */
   // working
 //   scrypt_core_simd128_4buf( X, V, N );
 /* 
   // original
   scrypt_core(X + 0 * 32, V, N);
 	scrypt_core(X + 1 * 32, V, N);
 	scrypt_core(X + 2 * 32, V, N);
 	scrypt_core(X + 3 * 32, V, N);
 */
 ////////////////////////////////
   if ( work_restart[thrid].restart ) return 0;
-   intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
+//   intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
   PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
@@ -1247,22 +1407,22 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
 {
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-	uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
+   uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
-	uint32_t midstate[8];
+   uint32_t midstate[8];
-	uint32_t n = pdata[19] - 1;
+   uint32_t n = pdata[19] - 1;
   int thr_id = mythr->id;  
   int throughput = scrypt_throughput;
-	int i;
+   int i;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-	for ( i = 0; i < throughput; i++ )
+   for ( i = 0; i < throughput; i++ )
-		memcpy( data + i * 20, pdata, 80 );
+      memcpy( data + i * 20, pdata, 80 );
   sha256_transform_le( midstate, data, sha256_initial_state );
-	do {
+   do {
      bool rc = true;
-		for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n;
+      for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n;
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
      if ( throughput == 16 )
@@ -1276,7 +1436,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
                                     opt_param_n, thr_id );
      else
 #endif
-      if ( throughput == 4 )
+      if ( throughput == 4 ) // slower on Ryzen than 8way
 #if defined(__SHA__)
         rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf,
                                         opt_param_n, thr_id );
@@ -1284,10 +1444,17 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
         rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf,
                                     opt_param_n, thr_id );
 #endif
 #if defined(__SHA__)
      else
      if (throughput == 2 )  // slower on Ryzen than 4way_sha & 8way
         rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf,
                                         opt_param_n, thr_id );
 #endif         
      else  // should never get here
         rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf,
                                opt_param_n, thr_id );
      // test the hash
      if ( rc )
      for ( i = 0; i < throughput; i++ )
      {
@@ -1319,11 +1486,11 @@ bool scrypt_miner_thread_init( int thr_id )
 bool register_scrypt_algo( algo_gate_t* gate )
 {
-#if defined(__SHA__)
+//#if defined(__SHA__)
-   gate->optimizations = SSE2_OPT | SHA_OPT;
+//   gate->optimizations = SSE2_OPT | SHA_OPT;
-#else
+//#else
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
-#endif
+//#endif
   gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
   gate->scanhash         = (void*)&scanhash_scrypt;
   opt_target_factor = 65536.0;
@@ -1332,16 +1499,29 @@ bool register_scrypt_algo( algo_gate_t* gate )
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
   scrypt_throughput = 16;
-   scratchbuf_size = opt_param_n * 3 * 128;  // 3 buf
+   if ( opt_param_n > 0x4000 )
      scratchbuf_size = opt_param_n * 3 * 128;  // 3 buf
   else      
      scratchbuf_size = opt_param_n * 4 * 128;  // 4 way
 /* SHA is slower than AVX2 on Ryzen
 #elif defined(__SHA__)
   scrypt_throughput = 4;
   scratchbuf_size = opt_param_n * 2 * 128;  // 2 buf
 */
 #elif defined(__AVX2__)
   scrypt_throughput = 8;   
-   scratchbuf_size = opt_param_n * 3 * 128;  // 3 buf
+   if ( opt_param_n > 0x4000 )
      scratchbuf_size = opt_param_n * 3 * 128;  // 3 buf
   else
      scratchbuf_size = opt_param_n * 2 * 128;  // 2 way
 #else
   scrypt_throughput = 4;
   if ( opt_param_n > 0x4000 )
   scratchbuf_size = opt_param_n * 2 * 128;  // 2 buf
   else
   scratchbuf_size = opt_param_n * 4 * 128;  // 4 way
 #endif
   char t_units[4] = {0};
--- a/algo/sha/sha-hash-4way.h
+++ b/algo/sha/sha-hash-4way.h
@@ -51,7 +51,6 @@ typedef struct {
   __m128i buf[64>>2];
   __m128i val[8];
   uint32_t count_high, count_low;
   bool initialized;
 } sha256_4way_context __attribute__ ((aligned (64)));
 void sha256_4way_init( sha256_4way_context *sc );
@@ -74,7 +73,6 @@ typedef struct {
   __m256i buf[64>>2];
   __m256i val[8];
   uint32_t count_high, count_low;
   bool initialized;
 } sha256_8way_context __attribute__ ((aligned (128)));
 void sha256_8way_init( sha256_8way_context *sc );
@@ -96,7 +94,6 @@ typedef struct {
   __m512i buf[64>>2];
   __m512i val[8];
   uint32_t count_high, count_low;
   bool initialized;
 } sha256_16way_context __attribute__ ((aligned (128)));
 void sha256_16way_init( sha256_16way_context *sc );
--- a/algo/sha/sha256-hash-4way.c
+++ b/algo/sha/sha256-hash-4way.c
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -70,6 +70,8 @@ extern "C"{
           C8, C9, CA, CB, CC, CD, CE, CF; \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7, \
           M8, M9, MA, MB, MC, MD, ME, MF; \
   const __m256i FIVE  = _mm256_set1_epi32( 5 ); \
   const __m256i THREE = _mm256_set1_epi32( 3 ); \
   sph_u32 Wlow, Whigh;
 #define READ_STATE8(state) do \
@@ -314,8 +316,7 @@ do { \
            _mm256_andnot_si256( xb3, xb2 ), \
            _mm256_mullo_epi32( mm256_xor3( xa0, xc, \
               _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
-                                   _mm256_set1_epi32(5UL) ) ), \
+                                   FIVE ) ), THREE ) ) ); \
               _mm256_set1_epi32(3UL) ) ) ); \
   xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
 } while (0)
@@ -667,7 +668,9 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 	        C8, C9, CA, CB, CC, CD, CE, CF; \
 	__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
 	        M8, M9, MA, MB, MC, MD, ME, MF; \
-	sph_u32 Wlow, Whigh;
+   const __m128i FIVE  = _mm_set1_epi32( 5 ); \
   const __m128i THREE = _mm_set1_epi32( 3 ); \
   sph_u32 Wlow, Whigh;
 #define READ_STATE(state) do \
 { \
@@ -931,8 +934,8 @@ do { \
   xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128(  \
            _mm_andnot_si128( xb3, xb2 ), \
            _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
-               _mm_mullo_epi32(  mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
+               _mm_mullo_epi32(  mm128_rol_32( xa1, 15 ), FIVE ) \
-                   ) ), _mm_set1_epi32(3UL) ) ) ) ); \
+                   ) ), THREE ) ) ) ); \
   xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
 } while (0)
--- a/algo/swifftx/inttypes.h
+++ b/algo/swifftx/inttypes.h
@@ -18,16 +18,20 @@
 #ifndef __INTTYPES_H_
 #define __INTTYPES_H_
 #include <stdint.h>
 /* Use [u]intN_t if you need exactly N bits.
  XXX - doesn't handle the -mint8 option.  */
 typedef signed char swift_int8_t;
 typedef unsigned char swift_uint8_t;
- typedef int swift_int16_t;
+ typedef int32_t swift_int16_t;
 // typedef int swift_int16_t;
 typedef unsigned int swift_uint16_t;
- typedef long swift_int32_t;
+ typedef int32_t swift_int32_t;
 // typedef long swift_int32_t;
 typedef unsigned long swift_uint32_t;
 typedef long long swift_int64_t;
--- a/algo/swifftx/swifftx.c
+++ b/algo/swifftx/swifftx.c
@@ -18,6 +18,8 @@
 //#include "stdbool.h"
 #include <memory.h>
 #include "simd-utils.h"
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // Constants and static tables portion.
 ///////////////////////////////////////////////////////////////////////////////////////////////
@@ -49,20 +51,20 @@
 // - A: the first operand. After the operation stores the sum of the two operands.
 // - B: the second operand. After the operation stores the difference between the first and the
 //   second operands.
-#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
+//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
 // Quickly reduces an integer modulo 257.
 //
 // Parameters:
 // - A: the input.
-#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
+//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
 // Since we need to do the setup only once, this is the indicator variable:
 static bool wasSetupDone = false;
 // This array stores the powers of omegas that correspond to the indices, which are the input
 // values. Known also as the "outer FFT twiddle factors".
-swift_int16_t multipliers[N];
+swift_int16_t multipliers[N] __attribute__ ((aligned (64)));
 // This array stores the powers of omegas, multiplied by the corresponding values.
 // We store this table to save computation time.
@@ -72,14 +74,14 @@ swift_int16_t multipliers[N];
 // compression function, i is between 0 and 31, x_i is a 64-bit value.
 // One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper --
 // formula (2), section 3, page 6.
-swift_int16_t fftTable[256 * EIGHTH_N];
+swift_int16_t fftTable[256 * EIGHTH_N] __attribute__ ((aligned (64)));
 // The A's we use in SWIFFTX shall be random elements of Z_257.
 // We generated these A's from the decimal expansion of PI as follows:  we converted each
 // triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A
 // element, otherwise move to the next triple of digits in the expansion. This guarntees that
 // the A's are random, provided that PI digits are.
-const swift_int16_t As[3 * M * N] =
+const swift_int16_t As[3 * M * N] __attribute__ ((aligned (64))) =
 {141,  78, 139,  75, 238, 205, 129, 126,  22, 245, 197, 169, 142, 118, 105,  78,
  50, 149,  29, 208, 114,  34,  85, 117,  67, 148,  86, 256,  25,  49, 133,  93,
  95,  36,  68, 231, 211, 102, 151, 128, 224, 117, 193,  27, 102, 187,   7, 105,
@@ -636,9 +638,202 @@ void InitializeSWIFFTX()
 	wasSetupDone = true;
 }
 // In the original code the F matrix is rotated so it was not aranged
 // the same as all the other data. Rearanging F to match all the other
 // data made vectorizing possible, the compiler probably could have been
 // able to auto-vectorize with proper data organisation.
 // Also in the original code the custom 16 bit data types are all now 32
 // bit int32_t regardless of the type name.
 //
 void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
 {
-	swift_int16_t *mult = multipliers;
+#if defined(__AVX2__)
   __m256i F[8] __attribute__ ((aligned (64)));
   __m256i *mul = (__m256i*)multipliers;
   __m256i *out = (__m256i*)output;
   __m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] );
   F[0] = _mm256_mullo_epi32( mul[0], *tbl );
   tbl = (__m256i*)&( fftTable[ input[1] << 3 ] );
   F[1] = _mm256_mullo_epi32( mul[1], *tbl );
   tbl = (__m256i*)&( fftTable[ input[2] << 3 ] );
   F[2] = _mm256_mullo_epi32( mul[2], *tbl );
   tbl = (__m256i*)&( fftTable[ input[3] << 3 ] );
   F[3] = _mm256_mullo_epi32( mul[3], *tbl );
   tbl = (__m256i*)&( fftTable[ input[4] << 3 ] );
   F[4] = _mm256_mullo_epi32( mul[4], *tbl );
   tbl = (__m256i*)&( fftTable[ input[5] << 3 ] );
   F[5] = _mm256_mullo_epi32( mul[5], *tbl );
   tbl = (__m256i*)&( fftTable[ input[6] << 3 ] );
   F[6] = _mm256_mullo_epi32( mul[6], *tbl );
   tbl = (__m256i*)&( fftTable[ input[7] << 3 ] );
   F[7] = _mm256_mullo_epi32( mul[7], *tbl );
   #define ADD_SUB( a, b ) \
   { \
      __m256i tmp = b; \
      b = _mm256_sub_epi32( a, b ); \
      a = _mm256_add_epi32( a, tmp ); \
   }
   ADD_SUB( F[0], F[1] );
   ADD_SUB( F[2], F[3] );
   ADD_SUB( F[4], F[5] );
   ADD_SUB( F[6], F[7] );
   F[3] = _mm256_slli_epi32( F[3], 4 );
   F[7] = _mm256_slli_epi32( F[7], 4 );
   ADD_SUB( F[0], F[2] );
   ADD_SUB( F[1], F[3] );
   ADD_SUB( F[4], F[6] );
   ADD_SUB( F[5], F[7] );  
   F[5] = _mm256_slli_epi32( F[5], 2 );
   F[6] = _mm256_slli_epi32( F[6], 4 );
   F[7] = _mm256_slli_epi32( F[7], 6 );
   ADD_SUB( F[0], F[4] );
   ADD_SUB( F[1], F[5] );
   ADD_SUB( F[2], F[6] );
   ADD_SUB( F[3], F[7] );
   #undef ADD_SUB
 #if defined (__AVX512VL__) && defined(__AVX512BW__)   
   #define Q_REDUCE( a ) \
       _mm256_sub_epi32( _mm256_and_si256( a, \
                 _mm256_movm_epi8( 0x11111111 ) ), _mm256_srai_epi32( a, 8 ) ) 
 #else   
   #define Q_REDUCE( a ) \
       _mm256_sub_epi32( _mm256_and_si256( a, \
                   m256_const1_32( 0x000000ff ) ), _mm256_srai_epi32( a, 8 ) ) 
 #endif
   out[0] = Q_REDUCE( F[0] );  
   out[1] = Q_REDUCE( F[1] );                        
   out[2] = Q_REDUCE( F[2] );                        
   out[3] = Q_REDUCE( F[3] );                        
   out[4] = Q_REDUCE( F[4] );                        
   out[5] = Q_REDUCE( F[5] );                        
   out[6] = Q_REDUCE( F[6] );                        
   out[7] = Q_REDUCE( F[7] );
   #undef Q_REDUCE
 #elif defined(__SSE4_1__)
   __m128i F[16] __attribute__ ((aligned (64)));
   __m128i *mul = (__m128i*)multipliers;
   __m128i *out = (__m128i*)output;
   __m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] );
   F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] );
   F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] );
   tbl = (__m128i*)&( fftTable[ input[1] << 3 ] );
   F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] );
   F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] );
   tbl = (__m128i*)&( fftTable[ input[2] << 3 ] );
   F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] );
   F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] );
   tbl = (__m128i*)&( fftTable[ input[3] << 3 ] );
   F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] );
   F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] );
   tbl = (__m128i*)&( fftTable[ input[4] << 3 ] );
   F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] );
   F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] );
   tbl = (__m128i*)&( fftTable[ input[5] << 3 ] );
   F[10] = _mm_mullo_epi32( mul[10], tbl[0] );
   F[11] = _mm_mullo_epi32( mul[11], tbl[1] );
   tbl = (__m128i*)&( fftTable[ input[6] << 3 ] );
   F[12] = _mm_mullo_epi32( mul[12], tbl[0] );
   F[13] = _mm_mullo_epi32( mul[13], tbl[1] );
   tbl = (__m128i*)&( fftTable[ input[7] << 3 ] );
   F[14] = _mm_mullo_epi32( mul[14], tbl[0] );
   F[15] = _mm_mullo_epi32( mul[15], tbl[1] );
   #define ADD_SUB( a, b ) \
   { \
      __m128i tmp = b; \
      b = _mm_sub_epi32( a, b ); \
      a = _mm_add_epi32( a, tmp ); \
   }
   ADD_SUB( F[ 0], F[ 2] );
   ADD_SUB( F[ 1], F[ 3] );
   ADD_SUB( F[ 4], F[ 6] );
   ADD_SUB( F[ 5], F[ 7] );
   ADD_SUB( F[ 8], F[10] );
   ADD_SUB( F[ 9], F[11] );
   ADD_SUB( F[12], F[14] );
   ADD_SUB( F[13], F[15] );
   F[ 6] = _mm_slli_epi32( F[ 6], 4 );
   F[ 7] = _mm_slli_epi32( F[ 7], 4 );
   F[14] = _mm_slli_epi32( F[14], 4 );
   F[15] = _mm_slli_epi32( F[15], 4 );
   ADD_SUB( F[ 0], F[ 4] );
   ADD_SUB( F[ 1], F[ 5] );
   ADD_SUB( F[ 2], F[ 6] );
   ADD_SUB( F[ 3], F[ 7] );
   ADD_SUB( F[ 8], F[12] );
   ADD_SUB( F[ 9], F[13] );
   ADD_SUB( F[10], F[14] );
   ADD_SUB( F[11], F[15] );
   F[10] = _mm_slli_epi32( F[10], 2 );
   F[11] = _mm_slli_epi32( F[11], 2 );
   F[12] = _mm_slli_epi32( F[12], 4 );
   F[13] = _mm_slli_epi32( F[13], 4 );
   F[14] = _mm_slli_epi32( F[14], 6 );
   F[15] = _mm_slli_epi32( F[15], 6 );
   ADD_SUB( F[ 0], F[ 8] );
   ADD_SUB( F[ 1], F[ 9] );
   ADD_SUB( F[ 2], F[10] );
   ADD_SUB( F[ 3], F[11] );
   ADD_SUB( F[ 4], F[12] );
   ADD_SUB( F[ 5], F[13] );
   ADD_SUB( F[ 6], F[14] );
   ADD_SUB( F[ 7], F[15] );
   #undef ADD_SUB
   #define Q_REDUCE( a ) \
      _mm_sub_epi32( _mm_and_si128( a, \
                   m128_const1_32( 0x000000ff ) ), _mm_srai_epi32( a, 8 ) ) 
   out[ 0] = Q_REDUCE( F[ 0] );
   out[ 1] = Q_REDUCE( F[ 1] );
   out[ 2] = Q_REDUCE( F[ 2] );
   out[ 3] = Q_REDUCE( F[ 3] );
   out[ 4] = Q_REDUCE( F[ 4] );
   out[ 5] = Q_REDUCE( F[ 5] );
   out[ 6] = Q_REDUCE( F[ 6] );
   out[ 7] = Q_REDUCE( F[ 7] );
   out[ 8] = Q_REDUCE( F[ 8] );
   out[ 9] = Q_REDUCE( F[ 9] );
   out[10] = Q_REDUCE( F[10] );
   out[11] = Q_REDUCE( F[11] );
   out[12] = Q_REDUCE( F[12] );
   out[13] = Q_REDUCE( F[13] );
   out[14] = Q_REDUCE( F[14] );
   out[15] = Q_REDUCE( F[15] );
   #undef Q_REDUCE
 #else   // < SSE4.1
   swift_int16_t *mult = multipliers;
   // First loop unrolling:
 	register swift_int16_t *table = &(fftTable[input[0] << 3]);
 /*
   swift_int32_t F[64];
@@ -666,11 +861,8 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
                F50, F51, F52, F53, F54, F55, F56, F57, F58, F59,
                F60, F61, F62, F63;
-   // First loop unrolling:
+	F0  = mult[0] * table[0];
-	register swift_int16_t *table = &(fftTable[input[0] << 3]);
+	F8  = mult[1] * table[1];
 	F0 = mult[0] * table[0];
 	F8 = mult[1] * table[1];
 	F16 = mult[2] * table[2];
 	F24 = mult[3] * table[3];
 	F32 = mult[4] * table[4];
@@ -678,90 +870,93 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
 	F48 = mult[6] * table[6];
 	F56 = mult[7] * table[7];
 	mult += 8;
 	table = &(fftTable[input[1] << 3]);
-	F1 = mult[0] * table[0];
+	F1  = mult[ 8] * table[0];
-	F9 = mult[1] * table[1];
+	F9  = mult[ 9] * table[1];
-	F17 = mult[2] * table[2];
+	F17 = mult[10] * table[2];
-	F25 = mult[3] * table[3];
+	F25 = mult[11] * table[3];
-	F33 = mult[4] * table[4];
+	F33 = mult[12] * table[4];
-	F41 = mult[5] * table[5];
+	F41 = mult[13] * table[5];
-	F49 = mult[6] * table[6];
+	F49 = mult[14] * table[6];
-	F57 = mult[7] * table[7];
+	F57 = mult[15] * table[7];
 	mult += 8;
 	table = &(fftTable[input[2] << 3]);
-	F2 = mult[0] * table[0];
+	F2  = mult[16] * table[0];
-	F10 = mult[1] * table[1];
+	F10 = mult[17] * table[1];
-	F18 = mult[2] * table[2];
+	F18 = mult[18] * table[2];
-	F26 = mult[3] * table[3];
+	F26 = mult[19] * table[3];
-	F34 = mult[4] * table[4];
+	F34 = mult[20] * table[4];
-	F42 = mult[5] * table[5];
+	F42 = mult[21] * table[5];
-	F50 = mult[6] * table[6];
+	F50 = mult[22] * table[6];
-	F58 = mult[7] * table[7];
+	F58 = mult[23] * table[7];
 	mult += 8;
 	table = &(fftTable[input[3] << 3]);
-	F3 = mult[0] * table[0];
+	F3  = mult[24] * table[0];
-	F11 = mult[1] * table[1];
+	F11 = mult[25] * table[1];
-	F19 = mult[2] * table[2];
+	F19 = mult[26] * table[2];
-	F27 = mult[3] * table[3];
+	F27 = mult[27] * table[3];
-	F35 = mult[4] * table[4];
+	F35 = mult[28] * table[4];
-	F43 = mult[5] * table[5];
+	F43 = mult[29] * table[5];
-	F51 = mult[6] * table[6];
+	F51 = mult[30] * table[6];
-	F59 = mult[7] * table[7];
+	F59 = mult[31] * table[7];
 	mult += 8;
 	table = &(fftTable[input[4] << 3]);
-	F4 = mult[0] * table[0];
+	F4  = mult[32] * table[0];
-	F12 = mult[1] * table[1];
+	F12 = mult[33] * table[1];
-	F20 = mult[2] * table[2];
+	F20 = mult[34] * table[2];
-	F28 = mult[3] * table[3];
+	F28 = mult[35] * table[3];
-	F36 = mult[4] * table[4];
+	F36 = mult[36] * table[4];
-	F44 = mult[5] * table[5];
+	F44 = mult[37] * table[5];
-	F52 = mult[6] * table[6];
+	F52 = mult[38] * table[6];
-	F60 = mult[7] * table[7];
+	F60 = mult[39] * table[7];
 	mult += 8;
 	table = &(fftTable[input[5] << 3]);
-	F5 = mult[0] * table[0];
+	F5  = mult[40] * table[0];
-	F13 = mult[1] * table[1];
+	F13 = mult[41] * table[1];
-	F21 = mult[2] * table[2];
+	F21 = mult[42] * table[2];
-	F29 = mult[3] * table[3];
+	F29 = mult[43] * table[3];
-	F37 = mult[4] * table[4];
+	F37 = mult[44] * table[4];
-	F45 = mult[5] * table[5];
+	F45 = mult[45] * table[5];
-	F53 = mult[6] * table[6];
+	F53 = mult[46] * table[6];
-	F61 = mult[7] * table[7];
+	F61 = mult[47] * table[7];
 	mult += 8;
 	table = &(fftTable[input[6] << 3]);
-	F6 = mult[0] * table[0];
+	F6  = mult[48] * table[0];
-	F14 = mult[1] * table[1];
+	F14 = mult[49] * table[1];
-	F22 = mult[2] * table[2];
+	F22 = mult[50] * table[2];
-	F30 = mult[3] * table[3];
+	F30 = mult[51] * table[3];
-	F38 = mult[4] * table[4];
+	F38 = mult[52] * table[4];
-	F46 = mult[5] * table[5];
+	F46 = mult[53] * table[5];
-	F54 = mult[6] * table[6];
+	F54 = mult[54] * table[6];
-	F62 = mult[7] * table[7];
+	F62 = mult[55] * table[7];
 	mult += 8;
 	table = &(fftTable[input[7] << 3]);
-	F7 = mult[0] * table[0];
+	F7  = mult[56] * table[0];
-	F15 = mult[1] * table[1];
+	F15 = mult[57] * table[1];
-	F23 = mult[2] * table[2];
+	F23 = mult[58] * table[2];
-	F31 = mult[3] * table[3];
+	F31 = mult[59] * table[3];
-	F39 = mult[4] * table[4];
+	F39 = mult[60] * table[4];
-	F47 = mult[5] * table[5];
+	F47 = mult[61] * table[5];
-	F55 = mult[6] * table[6];
+	F55 = mult[62] * table[6];
-	F63 = mult[7] * table[7];
+	F63 = mult[63] * table[7];
   #define ADD_SUB( a, b ) \
   { \
      int temp = b; \
      b = a - b; \
      a = a + temp; \
   }
   #define Q_REDUCE( a ) \
      ( ( (a) & 0xff ) - ( (a) >> 8 ) )
 /*
   for ( int i = 0; i < 8; i++ )
@@ -800,7 +995,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
   }
 */
 	// Second loop unrolling:
 	// Iteration 0:
 	ADD_SUB(F0, F1);
@@ -1057,6 +1251,11 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
 	output[47] = Q_REDUCE(F61);
 	output[55] = Q_REDUCE(F62);
 	output[63] = Q_REDUCE(F63);
   #undef ADD_SUB
   #undef Q_REDUCE
 #endif  // AVX2 elif SSE4.1 else
 }
 // Calculates the FFT part of SWIFFT.
@@ -1086,24 +1285,66 @@ void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output)
 // - m: the input size divided by 64.
 // - output: will store the result.
 // - a: the coefficients in the sum. Of size 64 * m.
-void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a)
+void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output,
                const swift_int16_t *a )
 {
 	int i, j;
-	swift_int32_t result[N];
+	swift_int32_t result[N] __attribute__ ((aligned (64)));
 	register swift_int16_t carry = 0;
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
   __m512i *res = (__m512i*)result;
   for ( j = 0; j < N/16; ++j )
   {
      __m512i sum = _mm512_setzero_si512();
      const __m512i *f = (__m512i*)input + j;
      const __m512i *k = (__m512i*)a + j;
      for ( i = 0; i < m; i++, f += N/16, k += N/16 )
         sum = _mm512_add_epi32( sum, _mm512_mullo_epi32( *f, *k ) );
      res[j] = sum;
   }
 #elif defined(__AVX2__)
   __m256i *res = (__m256i*)result;
   for ( j = 0; j < N/8; ++j )
   {
      __m256i sum = _mm256_setzero_si256();
      const __m256i *f = (__m256i*)input + j;
      const __m256i *k = (__m256i*)a + j;
      for ( i = 0; i < m; i++, f += N/8, k += N/8 )
         sum = _mm256_add_epi32( sum, _mm256_mullo_epi32( *f, *k ) );
      res[j] = sum;
   }
 #elif defined(__SSE4_1__)
   __m128i *res = (__m128i*)result;
   for ( j = 0; j < N/4; ++j )
   {
      __m128i sum = _mm_setzero_si128();
      const __m128i *f = (__m128i*)input + j;
      const __m128i *k = (__m128i*)a + j;
      for ( i = 0; i < m; i++, f += N/4, k += N/4 )
         sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) );
      res[j] = sum;
   }
 #else
 	for (j = 0; j < N; ++j)
 	{
 		register swift_int32_t sum = 0;
 		const register swift_int32_t *f = input + j;
 		const register swift_int16_t *k = a + j;
 		for (i = 0; i < m; i++, f += N,k += N)
 			sum += (*f) * (*k);
 		result[j] = sum;
 	}
 #endif
 	for (j = 0; j < N; ++j)
 		result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE;
@@ -1122,8 +1363,8 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
 {
 	int i;
 	// Will store the result of the FFT parts:
-	swift_int32_t fftOut[N * M];
+	swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
-	unsigned char intermediate[N * 3 + 8];
+	unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
 	unsigned char carry0,carry1,carry2;
 	// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
@@ -1199,8 +1440,8 @@ void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
 {
   int i;
   // Will store the result of the FFT parts:
-   swift_int32_t fftOut[N * M];
+   swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
-   unsigned char intermediate[N * 3 + 8];
+   unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
   unsigned char carry0,carry1,carry2;
   // Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.0.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.18.0'
+PACKAGE_VERSION='3.18.1'
-PACKAGE_STRING='cpuminer-opt 3.18.0'
+PACKAGE_STRING='cpuminer-opt 3.18.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.18.0 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.18.1 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.18.0:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.18.1:";;
   esac
  cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.18.0
+cpuminer-opt configure 3.18.1
 generated by GNU Autoconf 2.69
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 3.18.0, which was
+It was created by cpuminer-opt $as_me 3.18.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.18.0'
+ VERSION='3.18.1'
 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.18.0, which was
+This file was extended by cpuminer-opt $as_me 3.18.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.18.0
+cpuminer-opt config.status 3.18.1
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.18.0])
+AC_INIT([cpuminer-opt], [3.18.1])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -2083,7 +2083,8 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
                           / ( opt_target_factor * opt_diff_factor );
   diff_to_hash( g_work->target, g_work->targetdiff );
-   // Increment extranonce2
+   // Pre increment extranonce2 in case of being called again before receiving
   // a new job
   for ( int t = 0;
         t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] );
         t++ );
@@ -2103,20 +2104,12 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   pthread_mutex_unlock( &stats_lock );
   if ( !opt_quiet )
   {
      int mismatch = submitted_share_count
         - ( accepted_share_count + stale_share_count + rejected_share_count );
      if ( mismatch )
         applog(LOG_INFO, CL_LBL "%d Submitted share pending, maybe stale" CL_N, submitted_share_count );
   }
   if ( stratum_diff != sctx->job.diff )
      applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
                        sctx->job.diff, sctx->block_height, g_work->job_id );
   else if ( last_block_height != sctx->block_height )
-      applog( LOG_BLUE, "New Block %d, Job %s",
+      applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s",
-                        sctx->block_height, g_work->job_id );
+                        sctx->block_height, net_diff, g_work->job_id );
   else if ( g_work->job_id && new_job )
      applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
                         sctx->block_height, net_diff, g_work->job_id );
@@ -2173,7 +2166,6 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
               {
                  double net_hr = nd / net_ttf;
                  char net_hr_units[4] = {0};
                  scale_hash_for_display ( &net_hr, net_hr_units );
                  applog2( LOG_INFO, "Net hash rate (est) %.2f %sh/s",
                                     net_hr, net_hr_units );
@@ -2182,6 +2174,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
         }  // hr > 0
      } // !quiet
   }  // new diff/block
   if ( new_job && !opt_quiet )
   {
      int mismatch = submitted_share_count - ( accepted_share_count
                                             + stale_share_count
                                             + rejected_share_count );
      if ( mismatch )
         applog( LOG_INFO,
                 CL_LBL "%d Submitted share pending, maybe stale" CL_N,
                 submitted_share_count );
   }
 }
 static void *miner_thread( void *userdata )
@@ -3970,6 +3973,7 @@ int main(int argc, char *argv[])
   gettimeofday( &last_submit_time, NULL );
   memcpy( &five_min_start, &last_submit_time, sizeof (struct timeval) );
   memcpy( &session_start, &last_submit_time, sizeof (struct timeval) );
   memcpy( &total_hashes_time, &last_submit_time, sizeof (struct timeval) );
   pthread_mutex_unlock( &stats_lock );
   applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm",
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -1,7 +1,7 @@
 #if !defined(SIMD_256_H__)
 #define SIMD_256_H__ 1
-#if defined(__AVX2__)
+//#if defined(__AVX2__)
 /////////////////////////////////////////////////////////////////////
 //
@@ -14,7 +14,9 @@
 // is limited because 256 bit vectors are less likely to be used when 512
 // is available.
-// Used instead if casting.
+#if defined(__AVX__)
 // Used instead of casting.
 typedef union
 {
   __m256i m256;
@@ -23,6 +25,28 @@ typedef union
   uint32_t u32[8];
 } __attribute__ ((aligned (32))) m256_ovly;
 //
 // Pointer casting
 // p = any aligned pointer
 // returns p as pointer to vector type, not very useful
 #define castp_m256i(p) ((__m256i*)(p))
 // p = any aligned pointer
 // returns *p, watch your pointer arithmetic
 #define cast_m256i(p) (*((__m256i*)(p)))
 // p = any aligned pointer, i = scaled array index
 // returns value p[i]
 #define casti_m256i(p,i) (((__m256i*)(p))[(i)])
 // p = any aligned pointer, o = scaled offset
 // returns pointer p+o
 #define casto_m256i(p,o) (((__m256i*)(p))+(o))
 #endif
 #if defined(__AVX2__)
 // Move integer to low element of vector, other elements are set to zero.
 #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
@@ -91,26 +115,6 @@ static inline __m256i mm256_neg1_fn()
 #define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
 #define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
 //
 // Pointer casting
 // p = any aligned pointer
 // returns p as pointer to vector type, not very useful
 #define castp_m256i(p) ((__m256i*)(p))
 // p = any aligned pointer
 // returns *p, watch your pointer arithmetic
 #define cast_m256i(p) (*((__m256i*)(p)))
 // p = any aligned pointer, i = scaled array index
 // returns value p[i]
 #define casti_m256i(p,i) (((__m256i*)(p))[(i)])
 // p = any aligned pointer, o = scaled offset
 // returns pointer p+o
 #define casto_m256i(p,o) (((__m256i*)(p))+(o))
 //
 // Memory functions
 // n = number of 256 bit (32 byte) vectors
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -535,7 +535,6 @@ static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
 // Rotate 256 bit lanes by one 64 bit element
 #define mm512_shuflr256_64( v )     _mm512_permutex_epi64( v, 0x39 )
 #define mm512_shufll256_64( v )     _mm512_permutex_epi64( v, 0x93 )
 // Rotate 256 bit lanes by one 32 bit element
@@ -611,9 +610,6 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 // shufl2r is 2 input ...
 // Drop macros? They can easilly be rebuilt using shufl2 functions
 // add shuflr shufll functions performing rotate, returning first arg
 // They're faster than doing both, when both not needed.
 // Shuffle concatenated { v1, v2 ) right or left by 256 bits and return
 // rotated v1 
 // visually confusing for shif2r because of arg order. First arg is always