commit 47cc5dcff5
parent 2cd1507c2e
Author: Jay D Dee
Date: 2021-10-10 22:50:19 -04:00

14 changed files with 2057 additions and 2827 deletions

File diff suppressed because it is too large.


@@ -146,6 +146,119 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
output[i] = bswap_32( ostate[i] );
}
#if defined(__SHA__)
static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
const uint32_t *key1, uint32_t *tstate0, uint32_t *tstate1,
uint32_t *ostate0, uint32_t *ostate1 )
{
uint32_t ihash0[8], ihash1[8], pad0[16], pad1[16];
int i;
memcpy( pad0, key0 + 16, 16 );
memcpy( pad0 + 4, keypad, 48 );
memcpy( pad1, key1 + 16, 16 );
memcpy( pad1 + 4, keypad, 48 );
sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
tstate0, tstate1 );
memcpy( ihash0, tstate0, 32 );
memcpy( ihash1, tstate1, 32 );
for ( i = 0; i < 8; i++ )
{
pad0[i] = ihash0[i] ^ 0x5c5c5c5c;
pad1[i] = ihash1[i] ^ 0x5c5c5c5c;
}
for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c;
sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1,
sha256_initial_state, sha256_initial_state );
for ( i = 0; i < 8; i++ )
{
pad0[i] = ihash0[i] ^ 0x36363636;
pad1[i] = ihash1[i] ^ 0x36363636;
}
for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636;
sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
sha256_initial_state, sha256_initial_state );
}
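// For orientation: the 2BUF routine above is the standard HMAC-SHA256 key setup
// ( K' = SHA-256( 80-byte key ), then inner/outer pads K' ^ 0x36.. / K' ^ 0x5c.. )
// run for two independent keys at once. A minimal single-lane sketch of the same
// steps follows; it assumes the sha256_transform_le(), sha256_initial_state and
// keypad definitions already used in this file, and that tstate arrives holding
// the midstate of the key's first 64 bytes, as the callers here arrange.
static inline void HMAC_SHA256_80_init_sketch( const uint32_t *key,
   uint32_t *tstate, uint32_t *ostate )
{
   uint32_t ihash[8], pad[16];
   int i;
   // Finish K' = SHA-256( 80-byte key ): last 16 key bytes plus padding.
   memcpy( pad, key + 16, 16 );
   memcpy( pad + 4, keypad, 48 );
   sha256_transform_le( tstate, pad, tstate );
   memcpy( ihash, tstate, 32 );
   // Outer state: one transform over K' ^ 0x5c..5c from the initial state.
   for ( i = 0; i < 8; i++ )  pad[i] = ihash[i] ^ 0x5c5c5c5c;
   for (      ; i < 16; i++ ) pad[i] = 0x5c5c5c5c;
   sha256_transform_le( ostate, pad, sha256_initial_state );
   // Inner state: one transform over K' ^ 0x36..36 from the initial state.
   for ( i = 0; i < 8; i++ )  pad[i] = ihash[i] ^ 0x36363636;
   for (      ; i < 16; i++ ) pad[i] = 0x36363636;
   sha256_transform_le( tstate, pad, sha256_initial_state );
}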
static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0,
const uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1,
const uint32_t *salt0, const uint32_t *salt1, uint32_t *output0,
uint32_t *output1 )
{
uint32_t istate0[8], istate1[8], ostateb0[8], ostateb1[8];
uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16];
int i, j;
sha256_ni2way_transform_le( istate0, istate1, salt0, salt1,
tstate0, tstate1 );
memcpy( ibuf0, salt0 + 16, 16 );
memcpy( ibuf0 + 5, innerpad, 44 );
memcpy( obuf0 + 8, outerpad, 32 );
memcpy( ibuf1, salt1 + 16, 16 );
memcpy( ibuf1 + 5, innerpad, 44 );
memcpy( obuf1 + 8, outerpad, 32 );
for ( i = 0; i < 4; i++ )
{
memcpy( obuf0, istate0, 32 );
memcpy( obuf1, istate1, 32 );
ibuf0[4] = ibuf1[4] = i + 1;
sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1,
obuf0, obuf1 );
sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1,
ostate0, ostate1 );
for ( j = 0; j < 8; j++ )
{
output0[ 8*i + j ] = bswap_32( ostateb0[j] );
output1[ 8*i + j ] = bswap_32( ostateb1[j] );
}
}
}
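// For reference: with an iteration count of 1, as scrypt uses, PBKDF2 collapses to
// T(i) = HMAC-SHA256( password, salt || INT32_BE(i) ) for i = 1..4, concatenated
// into the 128-byte output; ibuf[4] = i + 1 above is that block index for both
// lanes. A generic single-lane sketch, using a hypothetical hmac_sha256() helper
// that is not part of this file:
void hmac_sha256( uint8_t mac[32], const uint8_t *key, size_t keylen,
                  const uint8_t *msg, size_t msglen );   // assumed helper
static void pbkdf2_sha256_c1( uint8_t *out, size_t dklen,
   const uint8_t *pwd, size_t pwdlen, const uint8_t *salt, size_t saltlen )
{
   uint8_t blk[ saltlen + 4 ];                  // salt || INT32_BE(i)
   memcpy( blk, salt, saltlen );
   for ( uint32_t i = 1; i <= dklen / 32; i++ )
   {
      blk[ saltlen   ] = (uint8_t)( i >> 24 );
      blk[ saltlen+1 ] = (uint8_t)( i >> 16 );
      blk[ saltlen+2 ] = (uint8_t)( i >>  8 );
      blk[ saltlen+3 ] = (uint8_t)  i;
      hmac_sha256( out + 32*(i-1), pwd, pwdlen, blk, saltlen + 4 );
   }
}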
static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
uint32_t *tstate1, uint32_t *ostate0, uint32_t *ostate1,
const uint32_t *salt0, const uint32_t *salt1,
uint32_t *output0, uint32_t *output1 )
{
uint32_t buf0[16], buf1[16];
int i;
sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1,
tstate0, tstate1 );
sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16,
tstate0, tstate1 );
sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk,
tstate0, tstate1 );
memcpy( buf0, tstate0, 32 );
memcpy( buf0 + 8, outerpad, 32 );
memcpy( buf1, tstate1, 32 );
memcpy( buf1 + 8, outerpad, 32 );
sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1,
ostate0, ostate1 );
for ( i = 0; i < 8; i++ )
{
output0[i] = bswap_32( ostate0[i] );
output1[i] = bswap_32( ostate1[i] );
}
}
#endif
#ifdef HAVE_SHA256_4WAY
static const uint32_t keypad_4way[4 * 12] = {
@@ -643,10 +756,10 @@ static bool scrypt_N_1_1_256(const uint32_t *input, uint32_t *output,
static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[8 * 8];
uint32_t _ALIGN(128) ostate[8 * 8];
uint32_t _ALIGN(128) W[8 * 32];
uint32_t _ALIGN(128) X[8 * 32];
uint32_t _ALIGN(128) tstate[ 8*8 ];
uint32_t _ALIGN(128) ostate[ 8*8 ];
uint32_t _ALIGN(128) W[ 8*32 ];
uint32_t _ALIGN(128) X[ 8*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_8x32( W, input, input+ 20, input+ 40, input+ 60,
@@ -658,53 +771,45 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
PBKDF2_SHA256_80_128_8way( tstate, ostate, W, W );
dintrlv_8x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224, W, 1024 );
if ( opt_param_n > 0x4000 )
{
scrypt_core_simd128_3buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
}
else
{
intrlv_2x128( W, X, X+ 32, 1024 );
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
intrlv_2x128( W+128, X+128, X+160, 1024 );
intrlv_2x128( W+192, X+192, X+224, 1024 );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
dintrlv_2x128( X+128, X+160, W+128, 1024 );
dintrlv_2x128( X+192, X+224, W+192, 1024 );
}
// SCRYPT CORE
// AVX512
/*
// AVX512 16 way working
intrlv_16x32( W, X, X+32, X+64, X+96, X+128, X+160, X+192, X+224,
X+256, X+256+32, X+256+64, X+256+96, X+256+128,
X+256+160, X+256+192, X+256+224, 1024 );
scrypt_core_16way( (__m512i*)W , (__m512i*)V, N );
dintrlv_16x32( X, X+32, X+64, X+96, X+128, X+160, X+192, X+224,
X+256, X+256+32, X+256+64, X+256+96, X+256+128,
X+256+160, X+256+192, X+256+224, W, 1024 );
*/
/*
// AVX512 working
intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 );
scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N );
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
*/
/*
// AVX512, not working, very slow
intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
*/
// AVX2
/*
// AVX2
// disable de/interleave for testing.
scrypt_core_8way( (__m256i*)W , (__m256i*)V, N );
*/
// scrypt_core_8way( (__m256i*)W , (__m256i*)V, N );
/*
// AVX2 working
@@ -714,23 +819,18 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
intrlv_2x128( W+192, X+192, X+224, 1024 );
// working
// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N );
// scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
// scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
// working
scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
// working
// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
scrypt_core_2way_simd128( (__m256i*)(W+ 64), (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
@@ -745,18 +845,10 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
intrlv_2x32( W+128, X+128, X+160, 1024 );
intrlv_2x32( W+192, X+192, X+224, 1024 );
// working, deprecated, not up to date
// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N );
// deprecated, not up to date
// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N );
// working
// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
@@ -813,19 +905,13 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
*/
/**************
scrypt_core_simd128_3buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
/*
// SSE2 working
scrypt_core_simd128_4buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4buf( X+128, V, N );
*/
*************/
if ( work_restart[thrid].restart ) return 0;
@@ -868,6 +954,39 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
W, 1024 );
if ( opt_param_n > 0x4000 )
{
scrypt_core_simd128_3buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+192, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+256, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+352, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
}
else
{
intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x128( W+256, X+256, X+288, X+320, X+352, 1024 );
intrlv_4x128( W+384, X+384, X+416, X+448, X+480, 1024 );
scrypt_core_4way_simd128( (__m512i*) W, (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+384), (__m512i*)V, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x128( X+256, X+288, X+320, X+352, W+256, 1024 );
dintrlv_4x128( X+384, X+416, X+448, X+480, W+384, 1024 );
}
// SCRYPT CORE
@@ -888,23 +1007,40 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
// AVX512 working
intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 );
intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 );
scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N );
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 );
dintrlv_4x32( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 );
*/
/*
// AVX512, not working, very slow
// AVX512, working
intrlv_4x128( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x128( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x128( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 );
intrlv_4x128( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 );
scrypt_core_4way_simd128( (__m512i*)W, (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+128), (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256), (__m512i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way_simd128( (__m512i*)(W+256+128), (__m512i*)V, N );
dintrlv_4x128( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x128( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x128( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 );
dintrlv_4x128( X+256+128, X+256+160, X+256+192, X+256+224, W+256+128, 1024 );
*/
// AVX2
/*
@@ -919,16 +1055,19 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_2x128( W+ 64, X+ 64, X+ 96, 1024 );
intrlv_2x128( W+128, X+128, X+160, 1024 );
intrlv_2x128( W+192, X+192, X+224, 1024 );
// working
// scrypt_core_2way_simd128_3buf( (__m256i*) W, (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
intrlv_2x128( W+256, X+256, X+256+ 32, 1024 );
intrlv_2x128( W+256+ 64, X+256+ 64, X+256+ 96, 1024 );
intrlv_2x128( W+256+128, X+256+128, X+256+160, 1024 );
intrlv_2x128( W+256+192, X+256+192, X+256+224, 1024 );
// working
scrypt_core_2way_simd128_2buf( (__m256i*) W, (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128_2buf( (__m256i*)(W+128), (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128_2buf( (__m256i*)(W+256), (__m256i*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_2way_simd128_2buf( (__m256i*)(W+256+128), (__m256i*)V, N );
// working
// scrypt_core_2way_simd128( (__m256i*) W, (__m256i*)V, N );
@@ -938,11 +1077,23 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
// scrypt_core_2way_simd128( (__m256i*)(W+128), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+192), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+256), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+256+ 64), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+256+128), (__m256i*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_2way_simd128( (__m256i*)(W+256+192), (__m256i*)V, N );
dintrlv_2x128( X, X+ 32, W, 1024 );
dintrlv_2x128( X+ 64, X+ 96, W+ 64, 1024 );
dintrlv_2x128( X+128, X+160, W+128, 1024 );
dintrlv_2x128( X+192, X+224, W+192, 1024 );
dintrlv_2x128( X+256, X+256+ 32, W+256, 1024 );
dintrlv_2x128( X+256+ 64, X+256+ 96, W+256+ 64, 1024 );
dintrlv_2x128( X+256+128, X+256+160, W+256+128, 1024 );
dintrlv_2x128( X+256+192, X+256+224, W+256+192, 1024 );
*/
/*
@@ -952,18 +1103,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_2x32( W+128, X+128, X+160, 1024 );
intrlv_2x32( W+192, X+192, X+224, 1024 );
// working, deprecated, not up to date
// scrypt_core_simd128_2way_4buf( (uint64_t*)W, (uint64_t*)V, N );
// deprecated, not up to date
// scrypt_core_simd128_2way_3buf( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_simd128_2way( (uint64_t*)( W+192 ), (uint64_t*)V, N );
// working
// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
// scrypt_core_simd128_2way_2buf( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
// scrypt_core_simd128_2way_2buf( (uint64_t*)( W+128 ), (uint64_t*)V, N );
// scrypt_core_simd128_2way( (uint64_t*) W, (uint64_t*)V, N );
// if ( work_restart[thrid].restart ) return 0;
@@ -1043,7 +1189,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
*/
/***************
scrypt_core_simd128_3buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_3buf( X+ 96, V, N );
@@ -1055,17 +1201,7 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
scrypt_core_simd128_3buf( X+352, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+448, V, N );
/*
// SSE2 working
scrypt_core_simd128_4buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4buf( X+128, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4buf( X+256, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4buf( X+384, V, N );
*/
********************/
/*
scrypt_core_3way( X, V, N );
if ( work_restart[thrid].restart ) return 0;
@@ -1102,6 +1238,31 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
#if defined(__SHA__)
static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 2*8 ];
uint32_t _ALIGN(128) ostate[ 2*8 ];
uint32_t _ALIGN(128) W[ 2*32 ];
uint32_t *V = (uint32_t*)scratchpad;
memcpy( tstate, midstate, 32 );
memcpy( tstate+ 8, midstate, 32 );
HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8,
ostate, ostate+8 );
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
input, input+20, W, W+32 );
scrypt_core_simd128_2buf( W, V, N );
if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
output, output+8 );
return 1;
}
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
{
@@ -1149,8 +1310,6 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
scrypt_core_simd128( W+96, V, N );
*/
// working
// scrypt_core_simd128_4buf( W, V, N );
if ( work_restart[thrid].restart ) return 0;
@@ -1171,10 +1330,9 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[4 * 8];
uint32_t _ALIGN(128) ostate[4 * 8];
uint32_t _ALIGN(128) W[4 * 32];
uint32_t _ALIGN(128) X[4 * 32];
uint32_t _ALIGN(128) tstate[ 4*8 ];
uint32_t _ALIGN(128) ostate[ 4*8 ];
uint32_t _ALIGN(128) W[ 4*32 ];
uint32_t *V = (uint32_t*)scratchpad;
intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
@@ -1184,7 +1342,21 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
HMAC_SHA256_80_init_4way(W, tstate, ostate);
PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
if ( opt_param_n > 0x4000 )
{
uint32_t _ALIGN(128) X[ 4*32 ];
dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
scrypt_core_simd128_2buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+64, V, N );
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
}
else
scrypt_core_4way( (__m128i*)W, (__m128i*)V, N );
// dintrlv_4x32( X, X+32, X+64, X+96, W, 1024 );
////// SCRYPT_CORE
@@ -1202,35 +1374,23 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128( X+96, V, N );
*/
/*
// working, double buffered linear simd, best for n2
scrypt_core_simd128_2buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_2buf( X+64, V, N );
*/
/*
scrypt_core_simd128_3buf( X, V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128( X+96, V, N );
*/
// working
// scrypt_core_simd128_4buf( X, V, N );
/*
// original
scrypt_core(X + 0 * 32, V, N);
scrypt_core(X + 1 * 32, V, N);
scrypt_core(X + 2 * 32, V, N);
scrypt_core(X + 3 * 32, V, N);
*/
////////////////////////////////
if ( work_restart[thrid].restart ) return 0;
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
// intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
@@ -1247,22 +1407,22 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8];
uint32_t n = pdata[19] - 1;
int thr_id = mythr->id;
int throughput = scrypt_throughput;
int i;
int i;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for ( i = 0; i < throughput; i++ )
memcpy( data + i * 20, pdata, 80 );
for ( i = 0; i < throughput; i++ )
memcpy( data + i * 20, pdata, 80 );
sha256_transform_le( midstate, data, sha256_initial_state );
do {
do {
bool rc = true;
for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n;
for ( i = 0; i < throughput; i++ ) data[ i*20 + 19 ] = ++n;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
if ( throughput == 16 )
@@ -1276,7 +1436,7 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
opt_param_n, thr_id );
else
#endif
if ( throughput == 4 )
if ( throughput == 4 ) // slower on Ryzen than 8way
#if defined(__SHA__)
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
@@ -1284,10 +1444,17 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
rc = scrypt_N_1_1_256_4way( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
#endif
#if defined(__SHA__)
else
if (throughput == 2 ) // slower on Ryzen than 4way_sha & 8way
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
#endif
else // should never get here
rc = scrypt_N_1_1_256( data, hash, midstate, scratchbuf,
opt_param_n, thr_id );
// test the hash
if ( rc )
for ( i = 0; i < throughput; i++ )
{
@@ -1319,11 +1486,11 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__)
gate->optimizations = SSE2_OPT | SHA_OPT;
#else
//#if defined(__SHA__)
// gate->optimizations = SSE2_OPT | SHA_OPT;
//#else
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#endif
//#endif
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
opt_target_factor = 65536.0;
@@ -1332,16 +1499,29 @@ bool register_scrypt_algo( algo_gate_t* gate )
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
scrypt_throughput = 16;
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
/* SHA is slower than AVX2 on Ryzen
#elif defined(__SHA__)
scrypt_throughput = 4;
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
*/
#elif defined(__AVX2__)
scrypt_throughput = 8;
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#else
scrypt_throughput = 4;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#endif
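// Note: with r = 1 each scrypt lane needs N * 128 bytes of scratch ( N blocks of
// 32 32-bit words ), so scratchbuf_size above is the per-thread size for however
// many lanes the selected core processes concurrently ( "2/3 buf" or "2/4 way" ).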
char t_units[4] = {0};


@@ -51,7 +51,6 @@ typedef struct {
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_4way_context __attribute__ ((aligned (64)));
void sha256_4way_init( sha256_4way_context *sc );
@@ -74,7 +73,6 @@ typedef struct {
__m256i buf[64>>2];
__m256i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
@@ -96,7 +94,6 @@ typedef struct {
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );

File diff suppressed because it is too large.


@@ -70,6 +70,8 @@ extern "C"{
C8, C9, CA, CB, CC, CD, CE, CF; \
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m256i FIVE = _mm256_set1_epi32( 5 ); \
const __m256i THREE = _mm256_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
#define READ_STATE8(state) do \
@@ -314,8 +316,7 @@ do { \
_mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \
_mm256_set1_epi32(5UL) ) ), \
_mm256_set1_epi32(3UL) ) ) ); \
FIVE ) ), THREE ) ) ); \
xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \
} while (0)
@@ -667,7 +668,9 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
C8, C9, CA, CB, CC, CD, CE, CF; \
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 Wlow, Whigh;
const __m128i FIVE = _mm_set1_epi32( 5 ); \
const __m128i THREE = _mm_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
#define READ_STATE(state) do \
{ \
@@ -931,8 +934,8 @@ do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
_mm_andnot_si128( xb3, xb2 ), \
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
) ), _mm_set1_epi32(3UL) ) ) ) ); \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \
) ), THREE ) ) ) ); \
xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
} while (0)


@@ -18,16 +18,20 @@
#ifndef __INTTYPES_H_
#define __INTTYPES_H_
#include <stdint.h>
/* Use [u]intN_t if you need exactly N bits.
XXX - doesn't handle the -mint8 option. */
typedef signed char swift_int8_t;
typedef unsigned char swift_uint8_t;
typedef int swift_int16_t;
typedef int32_t swift_int16_t;
// typedef int swift_int16_t;
typedef unsigned int swift_uint16_t;
typedef long swift_int32_t;
typedef int32_t swift_int32_t;
// typedef long swift_int32_t;
typedef unsigned long swift_uint32_t;
typedef long long swift_int64_t;


@@ -18,6 +18,8 @@
//#include "stdbool.h"
#include <memory.h>
#include "simd-utils.h"
///////////////////////////////////////////////////////////////////////////////////////////////
// Constants and static tables portion.
///////////////////////////////////////////////////////////////////////////////////////////////
@@ -49,20 +51,20 @@
// - A: the first operand. After the operation stores the sum of the two operands.
// - B: the second operand. After the operation stores the difference between the first and the
// second operands.
#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
//#define ADD_SUB(A, B) {register int temp = (B); B = ((A) - (B)); A = ((A) + (temp));}
// Quickly reduces an integer modulo 257.
//
// Parameters:
// - A: the input.
#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
//#define Q_REDUCE(A) (((A) & 0xff) - ((A) >> 8))
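// Why Q_REDUCE works: 256 is congruent to -1 (mod 257), so writing A = 256*hi + lo
// gives A = lo - hi (mod 257); with an arithmetic right shift that is exactly
// ( (A & 0xff) - (A >> 8) ). Worked example: 1000 = 3*256 + 232, and
// ( 1000 & 0xff ) - ( 1000 >> 8 ) = 232 - 3 = 229 = 1000 mod 257.
// ADD_SUB is the usual in-place butterfly: (A, B) <- (A + B, A - B).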
// Since we need to do the setup only once, this is the indicator variable:
static bool wasSetupDone = false;
// This array stores the powers of omegas that correspond to the indices, which are the input
// values. Known also as the "outer FFT twiddle factors".
swift_int16_t multipliers[N];
swift_int16_t multipliers[N] __attribute__ ((aligned (64)));
// This array stores the powers of omegas, multiplied by the corresponding values.
// We store this table to save computation time.
@@ -72,14 +74,14 @@ swift_int16_t multipliers[N];
// compression function, i is between 0 and 31, x_i is a 64-bit value.
// One can see the formula for this (intermediate) stage in the SWIFFT FSE 2008 paper --
// formula (2), section 3, page 6.
swift_int16_t fftTable[256 * EIGHTH_N];
swift_int16_t fftTable[256 * EIGHTH_N] __attribute__ ((aligned (64)));
// The A's we use in SWIFFTX shall be random elements of Z_257.
// We generated these A's from the decimal expansion of PI as follows: we converted each
// triple of digits into a decimal number d. If d < (257 * 3) we used (d % 257) for the next A
element, otherwise we moved to the next triple of digits in the expansion. This guarantees that
// the A's are random, provided that PI digits are.
const swift_int16_t As[3 * M * N] =
const swift_int16_t As[3 * M * N] __attribute__ ((aligned (64))) =
{141, 78, 139, 75, 238, 205, 129, 126, 22, 245, 197, 169, 142, 118, 105, 78,
50, 149, 29, 208, 114, 34, 85, 117, 67, 148, 86, 256, 25, 49, 133, 93,
95, 36, 68, 231, 211, 102, 151, 128, 224, 117, 193, 27, 102, 187, 7, 105,
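// Illustration only (not part of the original source): the generation procedure
// described above is easy to reproduce. Reading pi's decimals three at a time,
// "141" -> 141, "592" -> 592 % 257 = 78, "653" -> 139, "589" -> 75, "793" is
// skipped ( >= 771 ), "238" -> 238, ... which matches the first entries of As[].
// A hypothetical sketch of that walk, given the decimal expansion as a string:
static int next_A( const char **p )   // returns -1 when the digits run out
{
   while ( (*p)[0] && (*p)[1] && (*p)[2] )
   {
      int d = ( (*p)[0] - '0' ) * 100 + ( (*p)[1] - '0' ) * 10 + ( (*p)[2] - '0' );
      *p += 3;
      if ( d < 257 * 3 ) return d % 257;
   }
   return -1;
}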
@@ -636,9 +638,202 @@ void InitializeSWIFFTX()
wasSetupDone = true;
}
// In the original code the F matrix is rotated so it was not arranged
// the same as all the other data. Rearranging F to match all the other
// data made vectorizing possible; the compiler probably could have
// auto-vectorized with proper data organisation.
// Also, the custom 16-bit data types from the original code are now all
// 32-bit int32_t regardless of the type name.
//
void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
{
swift_int16_t *mult = multipliers;
#if defined(__AVX2__)
__m256i F[8] __attribute__ ((aligned (64)));
__m256i *mul = (__m256i*)multipliers;
__m256i *out = (__m256i*)output;
__m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] );
F[0] = _mm256_mullo_epi32( mul[0], *tbl );
tbl = (__m256i*)&( fftTable[ input[1] << 3 ] );
F[1] = _mm256_mullo_epi32( mul[1], *tbl );
tbl = (__m256i*)&( fftTable[ input[2] << 3 ] );
F[2] = _mm256_mullo_epi32( mul[2], *tbl );
tbl = (__m256i*)&( fftTable[ input[3] << 3 ] );
F[3] = _mm256_mullo_epi32( mul[3], *tbl );
tbl = (__m256i*)&( fftTable[ input[4] << 3 ] );
F[4] = _mm256_mullo_epi32( mul[4], *tbl );
tbl = (__m256i*)&( fftTable[ input[5] << 3 ] );
F[5] = _mm256_mullo_epi32( mul[5], *tbl );
tbl = (__m256i*)&( fftTable[ input[6] << 3 ] );
F[6] = _mm256_mullo_epi32( mul[6], *tbl );
tbl = (__m256i*)&( fftTable[ input[7] << 3 ] );
F[7] = _mm256_mullo_epi32( mul[7], *tbl );
#define ADD_SUB( a, b ) \
{ \
__m256i tmp = b; \
b = _mm256_sub_epi32( a, b ); \
a = _mm256_add_epi32( a, tmp ); \
}
ADD_SUB( F[0], F[1] );
ADD_SUB( F[2], F[3] );
ADD_SUB( F[4], F[5] );
ADD_SUB( F[6], F[7] );
F[3] = _mm256_slli_epi32( F[3], 4 );
F[7] = _mm256_slli_epi32( F[7], 4 );
ADD_SUB( F[0], F[2] );
ADD_SUB( F[1], F[3] );
ADD_SUB( F[4], F[6] );
ADD_SUB( F[5], F[7] );
F[5] = _mm256_slli_epi32( F[5], 2 );
F[6] = _mm256_slli_epi32( F[6], 4 );
F[7] = _mm256_slli_epi32( F[7], 6 );
ADD_SUB( F[0], F[4] );
ADD_SUB( F[1], F[5] );
ADD_SUB( F[2], F[6] );
ADD_SUB( F[3], F[7] );
#undef ADD_SUB
#if defined (__AVX512VL__) && defined(__AVX512BW__)
#define Q_REDUCE( a ) \
_mm256_sub_epi32( _mm256_and_si256( a, \
_mm256_movm_epi8( 0x11111111 ) ), _mm256_srai_epi32( a, 8 ) )
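// Note: _mm256_movm_epi8( 0x11111111 ) expands each mask bit to a byte, so bits
// 0, 4, 8, ... produce 0x000000ff in every 32-bit lane without a memory constant.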
#else
#define Q_REDUCE( a ) \
_mm256_sub_epi32( _mm256_and_si256( a, \
m256_const1_32( 0x000000ff ) ), _mm256_srai_epi32( a, 8 ) )
#endif
out[0] = Q_REDUCE( F[0] );
out[1] = Q_REDUCE( F[1] );
out[2] = Q_REDUCE( F[2] );
out[3] = Q_REDUCE( F[3] );
out[4] = Q_REDUCE( F[4] );
out[5] = Q_REDUCE( F[5] );
out[6] = Q_REDUCE( F[6] );
out[7] = Q_REDUCE( F[7] );
#undef Q_REDUCE
#elif defined(__SSE4_1__)
__m128i F[16] __attribute__ ((aligned (64)));
__m128i *mul = (__m128i*)multipliers;
__m128i *out = (__m128i*)output;
__m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] );
F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] );
F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[1] << 3 ] );
F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] );
F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[2] << 3 ] );
F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] );
F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[3] << 3 ] );
F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] );
F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[4] << 3 ] );
F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] );
F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[5] << 3 ] );
F[10] = _mm_mullo_epi32( mul[10], tbl[0] );
F[11] = _mm_mullo_epi32( mul[11], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[6] << 3 ] );
F[12] = _mm_mullo_epi32( mul[12], tbl[0] );
F[13] = _mm_mullo_epi32( mul[13], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[7] << 3 ] );
F[14] = _mm_mullo_epi32( mul[14], tbl[0] );
F[15] = _mm_mullo_epi32( mul[15], tbl[1] );
#define ADD_SUB( a, b ) \
{ \
__m128i tmp = b; \
b = _mm_sub_epi32( a, b ); \
a = _mm_add_epi32( a, tmp ); \
}
ADD_SUB( F[ 0], F[ 2] );
ADD_SUB( F[ 1], F[ 3] );
ADD_SUB( F[ 4], F[ 6] );
ADD_SUB( F[ 5], F[ 7] );
ADD_SUB( F[ 8], F[10] );
ADD_SUB( F[ 9], F[11] );
ADD_SUB( F[12], F[14] );
ADD_SUB( F[13], F[15] );
F[ 6] = _mm_slli_epi32( F[ 6], 4 );
F[ 7] = _mm_slli_epi32( F[ 7], 4 );
F[14] = _mm_slli_epi32( F[14], 4 );
F[15] = _mm_slli_epi32( F[15], 4 );
ADD_SUB( F[ 0], F[ 4] );
ADD_SUB( F[ 1], F[ 5] );
ADD_SUB( F[ 2], F[ 6] );
ADD_SUB( F[ 3], F[ 7] );
ADD_SUB( F[ 8], F[12] );
ADD_SUB( F[ 9], F[13] );
ADD_SUB( F[10], F[14] );
ADD_SUB( F[11], F[15] );
F[10] = _mm_slli_epi32( F[10], 2 );
F[11] = _mm_slli_epi32( F[11], 2 );
F[12] = _mm_slli_epi32( F[12], 4 );
F[13] = _mm_slli_epi32( F[13], 4 );
F[14] = _mm_slli_epi32( F[14], 6 );
F[15] = _mm_slli_epi32( F[15], 6 );
ADD_SUB( F[ 0], F[ 8] );
ADD_SUB( F[ 1], F[ 9] );
ADD_SUB( F[ 2], F[10] );
ADD_SUB( F[ 3], F[11] );
ADD_SUB( F[ 4], F[12] );
ADD_SUB( F[ 5], F[13] );
ADD_SUB( F[ 6], F[14] );
ADD_SUB( F[ 7], F[15] );
#undef ADD_SUB
#define Q_REDUCE( a ) \
_mm_sub_epi32( _mm_and_si128( a, \
m128_const1_32( 0x000000ff ) ), _mm_srai_epi32( a, 8 ) )
out[ 0] = Q_REDUCE( F[ 0] );
out[ 1] = Q_REDUCE( F[ 1] );
out[ 2] = Q_REDUCE( F[ 2] );
out[ 3] = Q_REDUCE( F[ 3] );
out[ 4] = Q_REDUCE( F[ 4] );
out[ 5] = Q_REDUCE( F[ 5] );
out[ 6] = Q_REDUCE( F[ 6] );
out[ 7] = Q_REDUCE( F[ 7] );
out[ 8] = Q_REDUCE( F[ 8] );
out[ 9] = Q_REDUCE( F[ 9] );
out[10] = Q_REDUCE( F[10] );
out[11] = Q_REDUCE( F[11] );
out[12] = Q_REDUCE( F[12] );
out[13] = Q_REDUCE( F[13] );
out[14] = Q_REDUCE( F[14] );
out[15] = Q_REDUCE( F[15] );
#undef Q_REDUCE
#else // < SSE4.1
swift_int16_t *mult = multipliers;
// First loop unrolling:
register swift_int16_t *table = &(fftTable[input[0] << 3]);
/*
swift_int32_t F[64];
@@ -666,11 +861,8 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
F50, F51, F52, F53, F54, F55, F56, F57, F58, F59,
F60, F61, F62, F63;
// First loop unrolling:
register swift_int16_t *table = &(fftTable[input[0] << 3]);
F0 = mult[0] * table[0];
F8 = mult[1] * table[1];
F0 = mult[0] * table[0];
F8 = mult[1] * table[1];
F16 = mult[2] * table[2];
F24 = mult[3] * table[3];
F32 = mult[4] * table[4];
@@ -678,90 +870,93 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
F48 = mult[6] * table[6];
F56 = mult[7] * table[7];
mult += 8;
table = &(fftTable[input[1] << 3]);
F1 = mult[0] * table[0];
F9 = mult[1] * table[1];
F17 = mult[2] * table[2];
F25 = mult[3] * table[3];
F33 = mult[4] * table[4];
F41 = mult[5] * table[5];
F49 = mult[6] * table[6];
F57 = mult[7] * table[7];
F1 = mult[ 8] * table[0];
F9 = mult[ 9] * table[1];
F17 = mult[10] * table[2];
F25 = mult[11] * table[3];
F33 = mult[12] * table[4];
F41 = mult[13] * table[5];
F49 = mult[14] * table[6];
F57 = mult[15] * table[7];
mult += 8;
table = &(fftTable[input[2] << 3]);
F2 = mult[0] * table[0];
F10 = mult[1] * table[1];
F18 = mult[2] * table[2];
F26 = mult[3] * table[3];
F34 = mult[4] * table[4];
F42 = mult[5] * table[5];
F50 = mult[6] * table[6];
F58 = mult[7] * table[7];
F2 = mult[16] * table[0];
F10 = mult[17] * table[1];
F18 = mult[18] * table[2];
F26 = mult[19] * table[3];
F34 = mult[20] * table[4];
F42 = mult[21] * table[5];
F50 = mult[22] * table[6];
F58 = mult[23] * table[7];
mult += 8;
table = &(fftTable[input[3] << 3]);
F3 = mult[0] * table[0];
F11 = mult[1] * table[1];
F19 = mult[2] * table[2];
F27 = mult[3] * table[3];
F35 = mult[4] * table[4];
F43 = mult[5] * table[5];
F51 = mult[6] * table[6];
F59 = mult[7] * table[7];
F3 = mult[24] * table[0];
F11 = mult[25] * table[1];
F19 = mult[26] * table[2];
F27 = mult[27] * table[3];
F35 = mult[28] * table[4];
F43 = mult[29] * table[5];
F51 = mult[30] * table[6];
F59 = mult[31] * table[7];
mult += 8;
table = &(fftTable[input[4] << 3]);
F4 = mult[0] * table[0];
F12 = mult[1] * table[1];
F20 = mult[2] * table[2];
F28 = mult[3] * table[3];
F36 = mult[4] * table[4];
F44 = mult[5] * table[5];
F52 = mult[6] * table[6];
F60 = mult[7] * table[7];
F4 = mult[32] * table[0];
F12 = mult[33] * table[1];
F20 = mult[34] * table[2];
F28 = mult[35] * table[3];
F36 = mult[36] * table[4];
F44 = mult[37] * table[5];
F52 = mult[38] * table[6];
F60 = mult[39] * table[7];
mult += 8;
table = &(fftTable[input[5] << 3]);
F5 = mult[0] * table[0];
F13 = mult[1] * table[1];
F21 = mult[2] * table[2];
F29 = mult[3] * table[3];
F37 = mult[4] * table[4];
F45 = mult[5] * table[5];
F53 = mult[6] * table[6];
F61 = mult[7] * table[7];
F5 = mult[40] * table[0];
F13 = mult[41] * table[1];
F21 = mult[42] * table[2];
F29 = mult[43] * table[3];
F37 = mult[44] * table[4];
F45 = mult[45] * table[5];
F53 = mult[46] * table[6];
F61 = mult[47] * table[7];
mult += 8;
table = &(fftTable[input[6] << 3]);
F6 = mult[0] * table[0];
F14 = mult[1] * table[1];
F22 = mult[2] * table[2];
F30 = mult[3] * table[3];
F38 = mult[4] * table[4];
F46 = mult[5] * table[5];
F54 = mult[6] * table[6];
F62 = mult[7] * table[7];
F6 = mult[48] * table[0];
F14 = mult[49] * table[1];
F22 = mult[50] * table[2];
F30 = mult[51] * table[3];
F38 = mult[52] * table[4];
F46 = mult[53] * table[5];
F54 = mult[54] * table[6];
F62 = mult[55] * table[7];
mult += 8;
table = &(fftTable[input[7] << 3]);
F7 = mult[0] * table[0];
F15 = mult[1] * table[1];
F23 = mult[2] * table[2];
F31 = mult[3] * table[3];
F39 = mult[4] * table[4];
F47 = mult[5] * table[5];
F55 = mult[6] * table[6];
F63 = mult[7] * table[7];
F7 = mult[56] * table[0];
F15 = mult[57] * table[1];
F23 = mult[58] * table[2];
F31 = mult[59] * table[3];
F39 = mult[60] * table[4];
F47 = mult[61] * table[5];
F55 = mult[62] * table[6];
F63 = mult[63] * table[7];
#define ADD_SUB( a, b ) \
{ \
int temp = b; \
b = a - b; \
a = a + temp; \
}
#define Q_REDUCE( a ) \
( ( (a) & 0xff ) - ( (a) >> 8 ) )
/*
for ( int i = 0; i < 8; i++ )
@@ -800,7 +995,6 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
}
*/
// Second loop unrolling:
// Iteration 0:
ADD_SUB(F0, F1);
@@ -1057,6 +1251,11 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
output[47] = Q_REDUCE(F61);
output[55] = Q_REDUCE(F62);
output[63] = Q_REDUCE(F63);
#undef ADD_SUB
#undef Q_REDUCE
#endif // AVX2 elif SSE4.1 else
}
// Calculates the FFT part of SWIFFT.
@@ -1086,24 +1285,66 @@ void SWIFFTFFT(const unsigned char *input, int m, swift_int32_t *output)
// - m: the input size divided by 64.
// - output: will store the result.
// - a: the coefficients in the sum. Of size 64 * m.
void SWIFFTSum(const swift_int32_t *input, int m, unsigned char *output, const swift_int16_t *a)
void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output,
const swift_int16_t *a )
{
int i, j;
swift_int32_t result[N];
swift_int32_t result[N] __attribute__ ((aligned (64)));
register swift_int16_t carry = 0;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
__m512i *res = (__m512i*)result;
for ( j = 0; j < N/16; ++j )
{
__m512i sum = _mm512_setzero_si512();
const __m512i *f = (__m512i*)input + j;
const __m512i *k = (__m512i*)a + j;
for ( i = 0; i < m; i++, f += N/16, k += N/16 )
sum = _mm512_add_epi32( sum, _mm512_mullo_epi32( *f, *k ) );
res[j] = sum;
}
#elif defined(__AVX2__)
__m256i *res = (__m256i*)result;
for ( j = 0; j < N/8; ++j )
{
__m256i sum = _mm256_setzero_si256();
const __m256i *f = (__m256i*)input + j;
const __m256i *k = (__m256i*)a + j;
for ( i = 0; i < m; i++, f += N/8, k += N/8 )
sum = _mm256_add_epi32( sum, _mm256_mullo_epi32( *f, *k ) );
res[j] = sum;
}
#elif defined(__SSE4_1__)
__m128i *res = (__m128i*)result;
for ( j = 0; j < N/4; ++j )
{
__m128i sum = _mm_setzero_si128();
const __m128i *f = (__m128i*)input + j;
const __m128i *k = (__m128i*)a + j;
for ( i = 0; i < m; i++, f += N/4, k += N/4 )
sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) );
res[j] = sum;
}
#else
for (j = 0; j < N; ++j)
{
register swift_int32_t sum = 0;
const register swift_int32_t *f = input + j;
const register swift_int16_t *k = a + j;
for (i = 0; i < m; i++, f += N,k += N)
sum += (*f) * (*k);
result[j] = sum;
}
#endif
for (j = 0; j < N; ++j)
result[j] = ((FIELD_SIZE << 22) + result[j]) % FIELD_SIZE;
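// ( FIELD_SIZE << 22 ) is a multiple of FIELD_SIZE added so the left operand of %
// is non-negative ( C's % may return a negative result for a negative operand );
// the residue modulo FIELD_SIZE is unchanged.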
@@ -1122,8 +1363,8 @@ void ComputeSingleSWIFFTX_smooth(unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
{
int i;
// Will store the result of the FFT parts:
swift_int32_t fftOut[N * M];
unsigned char intermediate[N * 3 + 8];
swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
unsigned char carry0,carry1,carry2;
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets
@@ -1199,8 +1440,8 @@ void ComputeSingleSWIFFTX( unsigned char input[SWIFFTX_INPUT_BLOCK_SIZE],
{
int i;
// Will store the result of the FFT parts:
swift_int32_t fftOut[N * M];
unsigned char intermediate[N * 3 + 8];
swift_int32_t fftOut[N * M] __attribute__ ((aligned (64)));
unsigned char intermediate[N * 3 + 8] __attribute__ ((aligned (64)));
unsigned char carry0,carry1,carry2;
// Do the three SWIFFTS while remembering the three carry bytes (each carry byte gets