v3.20.2

v3.20.1
2025-09-17 23:44:27 +00:00 · 2022-08-01 20:21:05 -04:00 · 2022-07-26 18:36:40 -04:00
34 changed files with 844 additions and 7120 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -285,11 +285,9 @@ cpuminer_SOURCES = \
  algo/x22/x22i-gate.c \
  algo/x22/x25x.c \
  algo/x22/x25x-4way.c \
-  algo/yescrypt/yescrypt.c \
-  algo/yescrypt/yescrypt-best.c \
  algo/yespower/yespower-gate.c \
  algo/yespower/yespower-blake2b.c \
-  algo/yespower/crypto/blake2b-yp.c \
+  algo/yespower/crypto/hmac-blake2b.c \
  algo/yespower/yescrypt-r8g.c \
  algo/yespower/yespower-opt.c

--- a/14
+++ b/14
@@ -65,6 +65,20 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v3.20.2
+
+Bit rotation optimizations to Blake256, Blake512, Blake2b, Blake2s & Lyra2-blake2b for SSE2 & AVX2.
+Removed old unused yescrypt library and other unused code.
+
+v3.20.1
+
+sph_blake2b optimized 1-way SSSE3 & AVX2.
+Removed duplicate Blake2b used by Power2b algo, will now use optimized sph_blake2b.
+Removed imprecise hash & target display from rejected share log.
+Share and target difficulty is now displayed only for low diificulty shares.
+Updated configure.ac to check for AVX512 asm support.
+Small optimization to Lyra2 SSE2.
+
 v3.20.0

 #375 Fixed segfault in algos using Groestl VAES due to use of uninitialized data.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -371,15 +371,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_X22I:         rc = register_x22i_algo          ( gate ); break;
    case ALGO_X25X:         rc = register_x25x_algo          ( gate ); break;
    case ALGO_XEVAN:        rc = register_xevan_algo         ( gate ); break;
-    case ALGO_YESCRYPT:     rc = register_yescrypt_05_algo   ( gate ); break;
-//    case ALGO_YESCRYPT:      register_yescrypt_algo      ( gate ); break;
-    case ALGO_YESCRYPTR8:   rc = register_yescryptr8_05_algo ( gate ); break;
-//    case ALGO_YESCRYPTR8:    register_yescryptr8_algo    ( gate ); break;
+    case ALGO_YESCRYPT:     rc = register_yescrypt_algo      ( gate ); break;
+    case ALGO_YESCRYPTR8:   rc = register_yescryptr8_algo    ( gate ); break;
    case ALGO_YESCRYPTR8G:  rc = register_yescryptr8g_algo   ( gate ); break;
-    case ALGO_YESCRYPTR16:  rc = register_yescryptr16_05_algo( gate ); break;
-//    case ALGO_YESCRYPTR16:   register_yescryptr16_algo   ( gate ); break;
-    case ALGO_YESCRYPTR32:  rc = register_yescryptr32_05_algo( gate ); break;
-//    case ALGO_YESCRYPTR32:   register_yescryptr32_algo   ( gate ); break;
+    case ALGO_YESCRYPTR16:  rc = register_yescryptr16_algo   ( gate ); break;
+    case ALGO_YESCRYPTR32:  rc = register_yescryptr32_algo   ( gate ); break;
    case ALGO_YESPOWER:     rc = register_yespower_algo      ( gate ); break;
    case ALGO_YESPOWERR16:  rc = register_yespowerr16_algo   ( gate ); break;
    case ALGO_YESPOWER_B2B: rc = register_yespower_b2b_algo  ( gate ); break;
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -400,18 +400,18 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
 // Blake-256 4 way

 #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
-do { \
+{ \
   a = _mm_add_epi32( _mm_add_epi32( a, b ), \
                      _mm_xor_si128( _mm_set1_epi32( c1 ), m0 ) ); \
-   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
+   d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
   a = _mm_add_epi32( _mm_add_epi32( a, b ), \
                      _mm_xor_si128( _mm_set1_epi32( c0 ), m1 ) ); \
-   d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
+   d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
-} while (0)
+}

 #if SPH_COMPACT_BLAKE_32

@@ -441,7 +441,8 @@ do { \

 #else

-#define ROUND_S_4WAY(r)   do { \
+#define ROUND_S_4WAY(r) \
+{ \
 	GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
 	GS_4WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
 	GS_4WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
@@ -450,7 +451,7 @@ do { \
 	GS_4WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
 	GS_4WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
 	GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
-} while (0)
+}

 #endif

@@ -537,7 +538,7 @@ do { \

 #if defined(__SSSE3__)

-#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
+#define BLAKE256_4WAY_BLOCK_BSWAP32 \
 { \
   __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
                                          0x0405060700010203 ); \
@@ -557,11 +558,11 @@ do { \
   MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
   ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
   MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
-} while(0)
+}

 #else  // SSE2

-#define BLAKE256_4WAY_BLOCK_BSWAP32 do \
+#define BLAKE256_4WAY_BLOCK_BSWAP32 \
 { \
   M0 = mm128_bswap_32( buf[0] ); \
   M1 = mm128_bswap_32( buf[1] ); \
@@ -579,12 +580,12 @@ do { \
   MD = mm128_bswap_32( buf[13] ); \
   ME = mm128_bswap_32( buf[14] ); \
   MF = mm128_bswap_32( buf[15] ); \
-} while(0)
+}

 #endif  // SSSE3 else SSE2

 #define COMPRESS32_4WAY( rounds ) \
-do { \
+{ \
   __m128i M0, M1, M2, M3, M4, M5, M6, M7; \
   __m128i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m128i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -631,7 +632,7 @@ do { \
   H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
   H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
   H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
-} while (0)
+}

 #endif

@@ -642,20 +643,21 @@ do { \
 // Blake-256 8 way

 #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
-do { \
+{ \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
                         _mm256_xor_si256( _mm256_set1_epi32( c1 ), m0 ) ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
+   d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
                         _mm256_xor_si256( _mm256_set1_epi32( c0 ), m1 ) ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
+   d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
-} while (0)
+}

-#define ROUND_S_8WAY(r)   do { \
+#define ROUND_S_8WAY(r) \
+{ \
        GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
        GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
        GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
@@ -664,7 +666,7 @@ do { \
        GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
        GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
        GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
-} while (0)
+}

 #define DECL_STATE32_8WAY \
   __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
@@ -699,7 +701,7 @@ do { \
 } while (0)

 #define COMPRESS32_8WAY( rounds ) \
-do { \
+{ \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
   __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -764,10 +766,10 @@ do { \
   H5 = mm256_xor3( VD, V5, H5 ); \
   H6 = mm256_xor3( VE, V6, H6 ); \
   H7 = mm256_xor3( VF, V7, H7 ); \
-} while (0)
+}

 #define COMPRESS32_8WAY_LE( rounds ) \
-do { \
+{ \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
   __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -829,7 +831,7 @@ do { \
   H5 = mm256_xor3( VD, V5, H5 ); \
   H6 = mm256_xor3( VE, V6, H6 ); \
   H7 = mm256_xor3( VF, V7, H7 ); \
-} while (0)
+}

 void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
                                       const void *data )
@@ -861,7 +863,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
   // G1   
   V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ),
                         _mm256_xor_si256( _mm256_set1_epi32( CS3 ), M[ 2] ) );
-   V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 );
+   V[13] = mm256_swap32_16( _mm256_xor_si256( V[13], V[ 1] ) );
   V[ 9] = _mm256_add_epi32( V[ 9], V[13] );
   V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
   V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
@@ -881,7 +883,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
   // G7   
   V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
                         _mm256_xor_si256( _mm256_set1_epi32( CSF ), M[14] ) );
-   V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
+   V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) );
   V[ 3] = _mm256_add_epi32( V[ 3],
                         _mm256_xor_si256( _mm256_set1_epi32( CSE ), M[15] ) );
 }
@@ -935,18 +937,18 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
   // G1   
   V1 = _mm256_add_epi32( V1,
                         _mm256_xor_si256( _mm256_set1_epi32( CS2 ), M3 ) );
-   VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 );
+   VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V1 ) );
   V9 = _mm256_add_epi32( V9, VD );
   V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 );

   // G4
   V0 = _mm256_add_epi32( V0, V5 );
-   VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 );
+   VF = mm256_swap32_16( _mm256_xor_si256( VF, V0 ) );
   VA = _mm256_add_epi32( VA, VF );
   V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
   V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
                         _mm256_xor_si256( _mm256_set1_epi32( CS8 ), M9 ) ) );
-   VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 );
+   VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) );
   VA = _mm256_add_epi32( VA, VF );
   V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );

@@ -954,12 +956,12 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
   GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );

   // G6
-   VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
+   VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) );
   V8 = _mm256_add_epi32( V8, VD );
   V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
   V2 = _mm256_add_epi32( _mm256_add_epi32( V2, V7 ),
                         _mm256_xor_si256( _mm256_set1_epi32( CSC ), MD ) );
-   VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
+   VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) );
   V8 = _mm256_add_epi32( V8, VD );
   V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );

@@ -967,7 +969,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
   V9 = _mm256_add_epi32( V9, VE );
   V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
   V3 = _mm256_add_epi32( V3, V4 );
-   VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
+   VE = mm256_shuflr32_8( _mm256_xor_si256( VE, V3 ) );
   V9 = _mm256_add_epi32( V9, VE );
   V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );

@@ -1009,7 +1011,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
 // Blake-256 16 way AVX512

 #define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
-do { \
+{ \
   a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
                         _mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \
   d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
@@ -1020,9 +1022,10 @@ do { \
   d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
   c = _mm512_add_epi32( c, d ); \
   b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
-} while (0)
+}

-#define ROUND_S_16WAY(r)   do { \
+#define ROUND_S_16WAY(r) \
+{ \
        GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
        GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
        GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
@@ -1031,7 +1034,7 @@ do { \
        GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
        GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
        GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
-} while (0)
+}

 #define DECL_STATE32_16WAY \
   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
@@ -1066,7 +1069,7 @@ do { \
 } while (0)

 #define COMPRESS32_16WAY( rounds ) \
-do { \
+{ \
   __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
   __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -1133,10 +1136,10 @@ do { \
   H5 = mm512_xor3( VD, V5, H5 ); \
   H6 = mm512_xor3( VE, V6, H6 ); \
   H7 = mm512_xor3( VF, V7, H7 ); \
-} while (0)
+}

 #define COMPRESS32_16WAY_LE( rounds ) \
-do { \
+{ \
   __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
   __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -1198,7 +1201,7 @@ do { \
   H5 = mm512_xor3( VD, V5, H5 ); \
   H6 = mm512_xor3( VE, V6, H6 ); \
   H7 = mm512_xor3( VF, V7, H7 ); \
-} while (0)
+}

 // Blake-256 prehash of the second block is split onto 2 parts. The first part
 // is constant for every nonce and only needs to be run once per job. The
--- a/algo/blake/blake2b-hash-4way.c
+++ b/algo/blake/blake2b-hash-4way.c
@@ -52,6 +52,180 @@ static const uint8_t sigma[12][16] =
 };


+#define Z00   0
+#define Z01   1
+#define Z02   2
+#define Z03   3
+#define Z04   4
+#define Z05   5
+#define Z06   6
+#define Z07   7
+#define Z08   8
+#define Z09   9
+#define Z0A   A
+#define Z0B   B
+#define Z0C   C
+#define Z0D   D
+#define Z0E   E
+#define Z0F   F
+
+#define Z10   E
+#define Z11   A
+#define Z12   4
+#define Z13   8
+#define Z14   9
+#define Z15   F
+#define Z16   D
+#define Z17   6
+#define Z18   1
+#define Z19   C
+#define Z1A   0
+#define Z1B   2
+#define Z1C   B
+#define Z1D   7
+#define Z1E   5
+#define Z1F   3
+
+#define Z20   B
+#define Z21   8
+#define Z22   C
+#define Z23   0
+#define Z24   5
+#define Z25   2
+#define Z26   F
+#define Z27   D
+#define Z28   A
+#define Z29   E
+#define Z2A   3
+#define Z2B   6
+#define Z2C   7
+#define Z2D   1
+#define Z2E   9
+#define Z2F   4
+
+#define Z30   7
+#define Z31   9
+#define Z32   3
+#define Z33   1
+#define Z34   D
+#define Z35   C
+#define Z36   B
+#define Z37   E
+#define Z38   2
+#define Z39   6
+#define Z3A   5
+#define Z3B   A
+#define Z3C   4
+#define Z3D   0
+#define Z3E   F
+#define Z3F   8
+
+#define Z40   9
+#define Z41   0
+#define Z42   5
+#define Z43   7
+#define Z44   2
+#define Z45   4
+#define Z46   A
+#define Z47   F
+#define Z48   E
+#define Z49   1
+#define Z4A   B
+#define Z4B   C
+#define Z4C   6
+#define Z4D   8
+#define Z4E   3
+#define Z4F   D
+
+#define Z50   2
+#define Z51   C
+#define Z52   6
+#define Z53   A
+#define Z54   0
+#define Z55   B
+#define Z56   8
+#define Z57   3
+#define Z58   4
+#define Z59   D
+#define Z5A   7
+#define Z5B   5
+#define Z5C   F
+#define Z5D   E
+#define Z5E   1
+#define Z5F   9
+
+#define Z60   C
+#define Z61   5
+#define Z62   1
+#define Z63   F
+#define Z64   E
+#define Z65   D
+#define Z66   4
+#define Z67   A
+#define Z68   0
+#define Z69   7
+#define Z6A   6
+#define Z6B   3
+#define Z6C   9
+#define Z6D   2
+#define Z6E   8
+#define Z6F   B
+
+#define Z70   D
+#define Z71   B
+#define Z72   7
+#define Z73   E
+#define Z74   C
+#define Z75   1
+#define Z76   3
+#define Z77   9
+#define Z78   5
+#define Z79   0
+#define Z7A   F
+#define Z7B   4
+#define Z7C   8
+#define Z7D   6
+#define Z7E   2
+#define Z7F   A
+
+#define Z80   6
+#define Z81   F
+#define Z82   E
+#define Z83   9
+#define Z84   B
+#define Z85   3
+#define Z86   0
+#define Z87   8
+#define Z88   C
+#define Z89   2
+#define Z8A   D
+#define Z8B   7
+#define Z8C   1
+#define Z8D   4
+#define Z8E   A
+#define Z8F   5
+
+#define Z90   A
+#define Z91   2
+#define Z92   8
+#define Z93   4
+#define Z94   7
+#define Z95   6
+#define Z96   1
+#define Z97   5
+#define Z98   F
+#define Z99   B
+#define Z9A   9
+#define Z9B   E
+#define Z9C   3
+#define Z9D   C
+#define Z9E   D
+#define Z9F   0
+
+#define Mx(r, i)    Mx_(Z ## r ## i)
+#define Mx_(n)      Mx__(n)
+#define Mx__(n)     M ## n
+
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define B2B8W_G(a, b, c, d, x, y) \
@@ -214,11 +388,11 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
 #define B2B_G(a, b, c, d, x, y) \
 { \
   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
-	v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
+	v[d] = mm256_swap64_32( _mm256_xor_si256( v[d], v[a] ) ); \
 	v[c] = _mm256_add_epi64( v[c], v[d] ); \
-	v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
+	v[b] = mm256_shuflr64_24( _mm256_xor_si256( v[b], v[c] ) ); \
 	v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
-	v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
+	v[d] = mm256_shuflr64_16( _mm256_xor_si256( v[d], v[a] ) ); \
 	v[c] = _mm256_add_epi64( v[c], v[d] ); \
 	v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
 }
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -108,11 +108,11 @@ do { \
   uint8_t s0 = sigma0; \
   uint8_t s1 = sigma1; \
   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
-   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
+   d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
-   d = mm128_ror_32( _mm_xor_si128( d, a ),  8 ); \
+   d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ),  7 ); \
 } while(0)
@@ -320,11 +320,11 @@ do { \
   uint8_t s0 = sigma0; \
   uint8_t s1 = sigma1; \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
+   d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ),  8 ); \
+   d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ),  7 ); \
 } while(0)
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -314,10 +314,11 @@ static const sph_u64 CB[16] = {

 // Blake-512 8 way AVX512

-#define GB_8WAY(m0, m1, c0, c1, a, b, c, d)   do { \
+#define GB_8WAY( m0, m1, c0, c1, a, b, c, d ) \
+{ \
   a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
                 _mm512_set1_epi64( c1 ), m0 ), b ), a ); \
-   d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
+   d = mm512_swap64_32( _mm512_xor_si512( d, a ) ); \
   c = _mm512_add_epi64( c, d ); \
   b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
   a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
@@ -325,9 +326,10 @@ static const sph_u64 CB[16] = {
   d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
   c = _mm512_add_epi64( c, d ); \
   b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
-} while (0)
+}

-#define ROUND_B_8WAY(r)   do { \
+#define ROUND_B_8WAY( r ) \
+{ \
   GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
   GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
   GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
@@ -336,13 +338,13 @@ static const sph_u64 CB[16] = {
   GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
   GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
   GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
-   } while (0)
+}

 #define DECL_STATE64_8WAY \
   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
   uint64_t T0, T1;

-#define COMPRESS64_8WAY( buf )   do \
+#define COMPRESS64_8WAY( buf ) \
 { \
  __m512i M0, M1, M2, M3, M4, M5, M6, M7; \
  __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -409,7 +411,7 @@ static const sph_u64 CB[16] = {
  H5 = mm512_xor3( VD, V5, H5 ); \
  H6 = mm512_xor3( VE, V6, H6 ); \
  H7 = mm512_xor3( VF, V7, H7 ); \
-} while (0)
+}

 void blake512_8way_compress( blake_8way_big_context *sc )
 { 
@@ -610,7 +612,7 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,

   V0 = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( 
                       _mm512_set1_epi64( CB9 ), sc->buf[ 8] ), V5 ), V0 ); 
-   VF = mm512_ror_64( _mm512_xor_si512( VF, V0 ), 32 ); 
+   VF = mm512_swap64_32( _mm512_xor_si512( VF, V0 ) ); 
   VA = _mm512_add_epi64( VA, VF ); 
   V5 = mm512_ror_64( _mm512_xor_si512( V5, VA ), 25 );
   V0 = _mm512_add_epi64( V0, V5 );
@@ -714,7 +716,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
 //   V1 = _mm512_add_epi64( V1, _mm512_xor_si512( _mm512_set1_epi64( c1 ), m0 );

   V1 = _mm512_add_epi64( V1, V5 );   
-   VD = mm512_ror_64( _mm512_xor_si512( VD, V1 ), 32 );
+   VD = mm512_swap64_32( _mm512_xor_si512( VD, V1 ) );
   V9 = _mm512_add_epi64( V9, VD );
   V5 = mm512_ror_64( _mm512_xor_si512( V5, V9 ), 25 );
   V1 = _mm512_add_epi64( V1, _mm512_add_epi64( _mm512_xor_si512(
@@ -728,7 +730,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
 //   V2 = _mm512_add_epi64( V2, V6 );
   V2 = _mm512_add_epi64( V2, _mm512_xor_si512( 
                 _mm512_set1_epi64( CBF ), M9 ) );
-   VE = mm512_ror_64( _mm512_xor_si512( VE, V2 ), 32 );
+   VE = mm512_swap64_32( _mm512_xor_si512( VE, V2 ) );
   VA = _mm512_add_epi64( VA, VE );
   V6 = mm512_ror_64( _mm512_xor_si512( V6, VA ), 25 );
   V2 = _mm512_add_epi64( V2, _mm512_add_epi64( _mm512_xor_si512(
@@ -742,7 +744,7 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
 //   V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512( 
 //                 _mm512_set1_epi64( CBx(1, 7) ), Mx(1, 6) ), V7 ) ); 

-   VF = mm512_ror_64( _mm512_xor_si512( VF, V3 ), 32 ); 
+   VF = mm512_swap64_32( _mm512_xor_si512( VF, V3 ) ); 
   VB = _mm512_add_epi64( VB, VF ); 
   V7 = mm512_ror_64( _mm512_xor_si512( V7, VB ), 25 );
   V3 = _mm512_add_epi64( V3, _mm512_add_epi64( _mm512_xor_si512(
@@ -757,7 +759,6 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
   GB_8WAY(Mx(1, C), Mx(1, D), CBx(1, C), CBx(1, D), V2, V7, V8, VD);
   GB_8WAY(Mx(1, E), Mx(1, F), CBx(1, E), CBx(1, F), V3, V4, V9, VE);

-
   // remaining rounds  
   ROUND_B_8WAY(2);
   ROUND_B_8WAY(3);
@@ -1055,20 +1056,22 @@ blake512_8way_close(void *cc, void *dst)

 // Blake-512 4 way

-#define GB_4WAY(m0, m1, c0, c1, a, b, c, d)   do { \
+#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) \
+{ \
   a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
                 _mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
-   d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
+   d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
   a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
                 _mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
-   d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
+   d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
-} while (0)
+}

-#define ROUND_B_4WAY(r)   do { \
+#define ROUND_B_4WAY(r) \
+{ \
 	GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
 	GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
 	GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
@@ -1077,13 +1080,13 @@ blake512_8way_close(void *cc, void *dst)
 	GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
 	GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
 	GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
-	} while (0)
+}

 #define DECL_STATE64_4WAY \
 	__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
 	uint64_t T0, T1;

-#define COMPRESS64_4WAY   do \
+#define COMPRESS64_4WAY \
 { \
  __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
  __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -1148,7 +1151,7 @@ blake512_8way_close(void *cc, void *dst)
  H5 = mm256_xor3( VD, V5, H5 ); \
  H6 = mm256_xor3( VE, V6, H6 ); \
  H7 = mm256_xor3( VF, V7, H7 ); \
-} while (0)
+}


 void blake512_4way_compress( blake_4way_big_context *sc )
@@ -1278,7 +1281,7 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
   // G4 skip nonce
   V0 = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256(
                       _mm256_set1_epi64x( CB9 ), sc->buf[ 8] ), V5 ), V0 );
-   VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 32 );
+   VF = mm256_swap64_32( _mm256_xor_si256( VF, V0 ) );
   VA = _mm256_add_epi64( VA, VF );
   V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 25 );
   V0 = _mm256_add_epi64( V0, V5 );
@@ -1365,7 +1368,7 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
   // finish round 0, with the nonce now available 
   V0 = _mm256_add_epi64( V0, _mm256_xor_si256(
                                       _mm256_set1_epi64x( CB8 ), M9 ) );
-   VF = mm256_ror_64( _mm256_xor_si256( VF, V0 ), 16 );
+   VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V0 ) );
   VA = _mm256_add_epi64( VA, VF );
   V5 = mm256_ror_64( _mm256_xor_si256( V5, VA ), 11 );

@@ -1375,34 +1378,34 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,

   // G1
   V1 = _mm256_add_epi64( V1, V5 );
-   VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 32 );
+   VD = mm256_swap64_32( _mm256_xor_si256( VD, V1 ) );
   V9 = _mm256_add_epi64( V9, VD );
   V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 25 );
   V1 = _mm256_add_epi64( V1, _mm256_add_epi64( _mm256_xor_si256(
                 _mm256_set1_epi64x( CBx(1,2) ), Mx(1,3) ), V5 ) );
-   VD = mm256_ror_64( _mm256_xor_si256( VD, V1 ), 16 );
+   VD = mm256_shuflr64_16( _mm256_xor_si256( VD, V1 ) );
   V9 = _mm256_add_epi64( V9, VD );
   V5 = mm256_ror_64( _mm256_xor_si256( V5, V9 ), 11 );

   // G2
   V2 = _mm256_add_epi64( V2, _mm256_xor_si256(
                 _mm256_set1_epi64x( CBF ), M9 ) );
-   VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 32 );
+   VE = mm256_swap64_32( _mm256_xor_si256( VE, V2 ) );
   VA = _mm256_add_epi64( VA, VE );
   V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 25 );
   V2 = _mm256_add_epi64( V2, _mm256_add_epi64( _mm256_xor_si256(
                 _mm256_set1_epi64x( CB9 ), MF ), V6 ) );
-   VE = mm256_ror_64( _mm256_xor_si256( VE, V2 ), 16 );
+   VE = mm256_shuflr64_16( _mm256_xor_si256( VE, V2 ) );
   VA = _mm256_add_epi64( VA, VE );
   V6 = mm256_ror_64( _mm256_xor_si256( V6, VA ), 11 );

   // G3
-   VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 32 );
+   VF = mm256_swap64_32( _mm256_xor_si256( VF, V3 ) );
   VB = _mm256_add_epi64( VB, VF );
   V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 25 );
   V3 = _mm256_add_epi64( V3, _mm256_add_epi64( _mm256_xor_si256(
                 _mm256_set1_epi64x( CBx(1, 6) ), Mx(1, 7) ), V7 ) );
-   VF = mm256_ror_64( _mm256_xor_si256( VF, V3 ), 16 );
+   VF = mm256_shuflr64_16( _mm256_xor_si256( VF, V3 ) );
   VB = _mm256_add_epi64( VB, VF );
   V7 = mm256_ror_64( _mm256_xor_si256( V7, VB ), 11 );

--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -30,18 +30,11 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
-
+#include "simd-utils.h"
 #include "algo/sha/sph_types.h"
 #include "sph_blake2b.h"

-// Cyclic right rotation.
-
-#ifndef ROTR64
-#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
-#endif
-
 // Little-endian byte access.
-
 #define B2B_GET64(p)                            \
 	(((uint64_t) ((uint8_t *) (p))[0]) ^        \
 	(((uint64_t) ((uint8_t *) (p))[1]) << 8) ^  \
@@ -52,47 +45,141 @@
 	(((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
 	(((uint64_t) ((uint8_t *) (p))[7]) << 56))

-// G Mixing function.
+#if defined(__AVX2__)

-#define B2B_G(a, b, c, d, x, y) {   \
-	v[a] = v[a] + v[b] + x;         \
-	v[d] = ROTR64(v[d] ^ v[a], 32); \
-	v[c] = v[c] + v[d];             \
-	v[b] = ROTR64(v[b] ^ v[c], 24); \
-	v[a] = v[a] + v[b] + y;         \
-	v[d] = ROTR64(v[d] ^ v[a], 16); \
-	v[c] = v[c] + v[d];             \
-	v[b] = ROTR64(v[b] ^ v[c], 63); }
+#define BLAKE2B_G( Sa, Sb, Sc, Sd, Se, Sf, Sg, Sh ) \
+{ \
+  V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
+              _mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \
+                                 m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
+  V[3] = mm256_swap64_32( _mm256_xor_si256( V[3], V[0] ) ); \
+  V[2] = _mm256_add_epi64( V[2], V[3] ); \
+  V[1] = mm256_shuflr64_24( _mm256_xor_si256( V[1], V[2] ) ); \
+\
+  V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
+              _mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \
+                                 m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
+  V[3] = mm256_shuflr64_16( _mm256_xor_si256( V[3], V[0] ) ); \
+  V[2] = _mm256_add_epi64( V[2], V[3] ); \
+  V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
+}
+
+#define BLAKE2B_ROUND( R ) \
+{ \
+  __m256i *V = (__m256i*)v; \
+  const uint8_t *sigmaR = sigma[R]; \
+  BLAKE2B_G(  0,  1,  2,  3,  4,  5,  6,  7 ); \
+  V[3] = mm256_shufll_64( V[3] ); \
+  V[2] = mm256_swap_128( V[2] ); \
+  V[1] = mm256_shuflr_64( V[1] ); \
+  BLAKE2B_G(  8,  9, 10, 11, 12, 13, 14, 15 ); \
+  V[3] = mm256_shuflr_64( V[3] ); \
+  V[2] = mm256_swap_128( V[2] ); \
+  V[1] = mm256_shufll_64( V[1] ); \
+}
+
+#elif defined(__SSSE3__)
+
+#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
+{ \
+   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
+                 _mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
+   Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
+   Vc = _mm_add_epi64( Vc, Vd ); \
+   Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
+\
+   Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
+                 _mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
+   Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
+   Vc = _mm_add_epi64( Vc, Vd ); \
+   Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
+}
+
+#define BLAKE2B_ROUND( R ) \
+{ \
+   __m128i *V = (__m128i*)v; \
+   __m128i V2, V3, V6, V7; \
+   const uint8_t *sigmaR = sigma[R]; \
+   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
+   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
+   V2 = mm128_shufl2r_64( V[2], V[3] ); \
+   V3 = mm128_shufl2r_64( V[3], V[2] ); \
+   V6 = mm128_shufl2l_64( V[6], V[7] ); \
+   V7 = mm128_shufl2l_64( V[7], V[6] ); \
+   BLAKE2B_G( V[0], V2, V[5], V6,  8,  9, 10, 11 ); \
+   BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
+   V[2] = mm128_shufl2l_64( V2, V3 ); \
+   V[3] = mm128_shufl2l_64( V3, V2 ); \
+   V[6] = mm128_shufl2r_64( V6, V7 ); \
+   V[7] = mm128_shufl2r_64( V7, V6 ); \
+}
+
+#else
+
+#ifndef ROTR64
+#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
+#endif
+
+#define BLAKE2B_G( R, Va, Vb, Vc, Vd, Sa, Sb ) \
+{ \
+   Va = Va + Vb + m[ sigma[R][Sa] ]; \
+   Vd = ROTR64( Vd ^ Va, 32 ); \
+   Vc = Vc + Vd; \
+   Vb = ROTR64( Vb ^ Vc, 24 ); \
+\
+   Va = Va + Vb + m[ sigma[R][Sb] ]; \
+   Vd = ROTR64( Vd ^ Va, 16 ); \
+   Vc = Vc + Vd; \
+   Vb = ROTR64( Vb ^ Vc, 63 ); \
+}
+
+#define BLAKE2B_ROUND( R ) \
+{ \
+   BLAKE2B_G( R, v[ 0], v[ 4], v[ 8], v[12],  0,  1 ); \
+   BLAKE2B_G( R, v[ 1], v[ 5], v[ 9], v[13],  2,  3 ); \
+   BLAKE2B_G( R, v[ 2], v[ 6], v[10], v[14],  4,  5 ); \
+   BLAKE2B_G( R, v[ 3], v[ 7], v[11], v[15],  6,  7 ); \
+   BLAKE2B_G( R, v[ 0], v[ 5], v[10], v[15],  8,  9 ); \
+   BLAKE2B_G( R, v[ 1], v[ 6], v[11], v[12], 10, 11 ); \
+   BLAKE2B_G( R, v[ 2], v[ 7], v[ 8], v[13], 12, 13 ); \
+   BLAKE2B_G( R, v[ 3], v[ 4], v[ 9], v[14], 14, 15 ); \
+}
+
+#endif

 // Initialization Vector.

-static const uint64_t blake2b_iv[8] = {
+static const uint64_t blake2b_iv[8] __attribute__ ((aligned (32))) =
+{
 	0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
 	0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
 	0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
 	0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 };

+static const uint8_t sigma[12][16] __attribute__ ((aligned (32))) =
+{
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+      { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+      { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+      { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+      { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+      { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+      { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+      { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+      { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+      { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+
 // Compression function. "last" flag indicates last block.

 static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
 {
-	const uint8_t sigma[12][16] = {
-		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-		{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-		{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-		{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-		{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-		{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-		{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-		{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-		{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-		{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-		{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
-	};
-	int i;
-	uint64_t v[16], m[16];
+	uint64_t v[16] __attribute__ ((aligned (32)));
+   uint64_t m[16] __attribute__ ((aligned (32)));
+   int i;

 	for (i = 0; i < 8; i++) {           // init work variables
 		v[i] = ctx->h[i];
@@ -106,16 +193,8 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
 	for (i = 0; i < 16; i++)            // get little-endian words
 		m[i] = B2B_GET64(&ctx->b[8 * i]);

-	for (i = 0; i < 12; i++) {          // twelve rounds
-		B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
-		B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
-		B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
-		B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
-		B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
-		B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
-		B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
-		B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
-	}
+	for (i = 0; i < 12; i++)
+      BLAKE2B_ROUND( i );   

 	for( i = 0; i < 8; ++i )
 		ctx->h[i] ^= v[i] ^ v[i + 8];
--- a/algo/heavy/sph_hefty1.c
+++ b/algo/heavy/sph_hefty1.c
@@ -1,382 +0,0 @@
-/*
- * HEFTY1 cryptographic hash function
- *
- * Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * The views and conclusions contained in the software and documentation are those
- * of the authors and should not be interpreted as representing official policies,
- * either expressed or implied, of the FreeBSD Project.
- */
-
-#include <assert.h>
-#include <string.h>
-
-#ifdef _MSC_VER
-#define inline __inline
-#endif
-
-#include "sph_hefty1.h"
-
-#define Min(A, B) (A <= B ? A : B)
-#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K)                    \
-    {                                                                   \
-        /* To thwart parallelism, Br modifies itself each time it's     \
-         * called.  This also means that calling it in different        \
-         * orders yeilds different results.  In C the order of          \
-         * evaluation of function arguments and + operands are          \
-         * unspecified (and depends on the compiler), so we must make   \
-         * the order of Br calls explicit.                              \
-         */                                                             \
-        uint32_t brG = Br(ctx, G);                                      \
-        uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K;             \
-        uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E));                      \
-        uint32_t brC = Br(ctx, C);                                      \
-        uint32_t brB = Br(ctx, B);                                      \
-        uint32_t tmp3 = Ma(Br(ctx, A), brB, brC);                       \
-        uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A));                      \
-        H = G;                                                          \
-        G = F;                                                          \
-        F = E;                                                          \
-        E = D + Br(ctx, tmp2);                                          \
-        D = C;                                                          \
-        C = B;                                                          \
-        B = A;                                                          \
-        A = tmp2 + tmp4;                                                \
-    }                                                                   \
-
-/* Nothing up my sleeve constants */
-const static uint32_t K[64] = {
-    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
-    0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
-    0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
-    0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
-    0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
-    0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
-    0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
-    0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
-    0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
-    0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
-    0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
-    0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
-    0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
-    0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
-    0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
-    0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
-};
-
-/* Initial hash values */
-const static uint32_t H[HEFTY1_STATE_WORDS] = {
-    0x6a09e667UL,
-    0xbb67ae85UL,
-    0x3c6ef372UL,
-    0xa54ff53aUL,
-    0x510e527fUL,
-    0x9b05688cUL,
-    0x1f83d9abUL,
-    0x5be0cd19UL
-};
-
-static inline uint32_t Rr(uint32_t X, uint8_t n)
-{
-    return (X >> n) | (X << (32 - n));
-}
-
-static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G)
-{
-    return (E & F) ^ (~E & G);
-}
-
-static inline uint32_t Sigma1(uint32_t E)
-{
-    return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25);
-}
-
-static inline uint32_t sigma1(uint32_t X)
-{
-    return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10);
-}
-
-static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C)
-{
-    return (A & B) ^ (A & C) ^ (B & C);
-}
-
-static inline uint32_t Sigma0(uint32_t A)
-{
-    return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22);
-}
-
-static inline uint32_t sigma0(uint32_t X)
-{
-    return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3);
-}
-
-static inline uint32_t Reverse32(uint32_t n)
-{
-    #if BYTE_ORDER == LITTLE_ENDIAN
-        return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24;
-    #else
-        return n;
-    #endif
-}
-
-static inline uint64_t Reverse64(uint64_t n)
-{
-    #if BYTE_ORDER == LITTLE_ENDIAN
-        uint32_t a = n >> 32;
-        uint32_t b = (n << 32) >> 32;
-
-        return (uint64_t)Reverse32(b) << 32 | Reverse32(a);
-    #else
-        return n;
-    #endif
-}
-
-/* Smoosh byte into nibble */
-static inline uint8_t Smoosh4(uint8_t X)
-{
-    return (X >> 4) ^ (X & 0xf);
-}
-
-/* Smoosh 32-bit word into 2-bits */
-static inline uint8_t Smoosh2(uint32_t X)
-{
-    uint16_t w = (X >> 16) ^ (X & 0xffff);
-    uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff));
-    return (n >> 2) ^ (n & 0x3);
-}
-
-static void Mangle(uint32_t *S)
-{
-    uint32_t *R = S;
-    uint32_t *C = &S[1];
-
-    uint8_t r0 = Smoosh4(R[0] >> 24);
-    uint8_t r1 = Smoosh4(R[0] >> 16);
-    uint8_t r2 = Smoosh4(R[0] >> 8);
-    uint8_t r3 = Smoosh4(R[0] & 0xff);
-
-    int i;
-
-    /* Diffuse */
-    uint32_t tmp = 0;
-    for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) {
-        uint8_t r = Smoosh2(tmp);
-        switch (r) {
-        case 0:
-            C[i] ^= Rr(R[0], i + r0);
-            break;
-        case 1:
-            C[i] += Rr(~R[0], i + r1);
-            break;
-        case 2:
-            C[i] &= Rr(~R[0], i + r2);
-            break;
-        case 3:
-            C[i] ^= Rr(R[0], i + r3);
-            break;
-        }
-        tmp ^= C[i];
-    }
-
-    /* Compress */
-    tmp = 0;
-    for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++)
-        if (i % 2)
-            tmp ^= C[i];
-        else
-            tmp += C[i];
-    R[0] ^= tmp;
-}
-
-static void Absorb(uint32_t *S, uint32_t X)
-{
-    uint32_t *R = S;
-    R[0] ^= X;
-    Mangle(S);
-}
-
-static uint32_t Squeeze(uint32_t *S)
-{
-    uint32_t Y = S[0];
-    Mangle(S);
-    return Y;
-}
-
-/* Branch, compress and serialize function */
-static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X)
-{
-    uint32_t R = Squeeze(ctx->sponge);
-
-    uint8_t r0 = R >> 8;
-    uint8_t r1 = R & 0xff;
-
-    uint32_t Y = 1 << (r0 % 32);
-
-    switch (r1 % 4)
-    {
-    case 0:
-        /* Do nothing */
-        break;
-    case 1:
-        return X & ~Y;
-    case 2:
-        return X | Y;
-    case 3:
-        return X ^ Y;
-    }
-
-    return X;
-}
-
-static void HashBlock(HEFTY1_CTX *ctx)
-{
-    uint32_t A, B, C, D, E, F, G, H;
-    uint32_t W[HEFTY1_BLOCK_BYTES];
-
-    assert(ctx);
-
-    A = ctx->h[0];
-    B = ctx->h[1];
-    C = ctx->h[2];
-    D = ctx->h[3];
-    E = ctx->h[4];
-    F = ctx->h[5];
-    G = ctx->h[6];
-    H = ctx->h[7];
-
-    int t = 0;
-    for (; t < 16; t++) {
-        W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */
-        Absorb(ctx->sponge, W[t] ^ K[t]);
-    }
-
-    for (t = 0; t < 16; t++) {
-        Absorb(ctx->sponge, D ^ H);
-        RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
-    }
-    for (t = 16; t < 64; t++) {
-        Absorb(ctx->sponge, H + D);
-        W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
-        RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]);
-    }
-
-    ctx->h[0] += A;
-    ctx->h[1] += B;
-    ctx->h[2] += C;
-    ctx->h[3] += D;
-    ctx->h[4] += E;
-    ctx->h[5] += F;
-    ctx->h[6] += G;
-    ctx->h[7] += H;
-
-    A = 0;
-    B = 0;
-    C = 0;
-    D = 0;
-    E = 0;
-    F = 0;
-    G = 0;
-    H = 0;
-
-    memset(W, 0, sizeof(W));
-}
-
-/* Public interface */
-
-void HEFTY1_Init(HEFTY1_CTX *ctx)
-{
-    assert(ctx);
-
-    memcpy(ctx->h, H, sizeof(ctx->h));
-    memset(ctx->block, 0, sizeof(ctx->block));
-    ctx->written = 0;
-    memset(ctx->sponge, 0, sizeof(ctx->sponge));
-}
-
-void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len)
-{
-    assert(ctx);
-
-    uint64_t read = 0;
-    while (len) {
-        size_t end = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
-        size_t count = Min(len, HEFTY1_BLOCK_BYTES - end);
-        memcpy(&ctx->block[end], &((unsigned char *)buf)[read], count);
-        len -= count;
-        read += count;
-        ctx->written += count;
-        if (!(ctx->written % HEFTY1_BLOCK_BYTES))
-            HashBlock(ctx);
-    }
-}
-
-void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx)
-{
-    assert(digest);
-    assert(ctx);
-
-    /* Pad message (FIPS 180 Section 5.1.1) */
-    size_t used = (size_t)(ctx->written % HEFTY1_BLOCK_BYTES);
-    ctx->block[used++] = 0x80; /* Append 1 to end of message */
-    if (used > HEFTY1_BLOCK_BYTES - 8) {
-        /* We have already written into the last 64bits, so
-         * we must continue into the next block. */
-        memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - used);
-        HashBlock(ctx);
-        used = 0; /* Create a new block (below) */
-    }
-
-    /* All remaining bits to zero */
-    memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - used);
-
-    /* The last 64bits encode the length (in network byte order) */
-    uint64_t *len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8];
-    *len = Reverse64(ctx->written*8);
-
-    HashBlock(ctx);
-
-    /* Convert back to network byte order */
-    int i = 0;
-    for (; i < HEFTY1_STATE_WORDS; i++)
-        ctx->h[i] = Reverse32(ctx->h[i]);
-
-    memcpy(digest, ctx->h, sizeof(ctx->h));
-    memset(ctx, 0, sizeof(HEFTY1_CTX));
-}
-
-unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest)
-{
-    HEFTY1_CTX ctx;
-    static unsigned char m[HEFTY1_DIGEST_BYTES];
-
-    if (!digest)
-        digest = m;
-
-    HEFTY1_Init(&ctx);
-    HEFTY1_Update(&ctx, buf, len);
-    HEFTY1_Final(digest, &ctx);
-
-    return digest;
-}
--- a/algo/heavy/sph_hefty1.h
+++ b/algo/heavy/sph_hefty1.h
@@ -1,66 +0,0 @@
-/*
- * HEFTY1 cryptographic hash function
- *
- * Copyright (c) 2014, dbcc14 <BM-NBx4AKznJuyem3dArgVY8MGyABpihRy5>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * The views and conclusions contained in the software and documentation are those
- * of the authors and should not be interpreted as representing official policies,
- * either expressed or implied, of the FreeBSD Project.
- */
-
-#ifndef __HEFTY1_H__
-#define __HEFTY1_H__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef WIN32
-#include <sys/types.h>
-#endif
-
-#include <inttypes.h>
-
-#define HEFTY1_DIGEST_BYTES 32
-#define HEFTY1_BLOCK_BYTES 64
-#define HEFTY1_STATE_WORDS 8
-#define HEFTY1_SPONGE_WORDS 4
-
-typedef struct HEFTY1_CTX {
-    uint32_t h[HEFTY1_STATE_WORDS];
-    uint8_t  block[HEFTY1_BLOCK_BYTES];
-    uint64_t written;
-    uint32_t sponge[HEFTY1_SPONGE_WORDS];
-} HEFTY1_CTX;
-
-void HEFTY1_Init(HEFTY1_CTX *cxt);
-void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len);
-void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt);
-unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* __HEFTY1_H__ */
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -97,11 +97,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, updates all args
 #define G_4X64(a,b,c,d) \
   a = _mm256_add_epi64( a, b ); \
-   d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
+   d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi64( c, d ); \
-   b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
+   b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); \
   a = _mm256_add_epi64( a, b ); \
-   d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
+   d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );

@@ -137,11 +137,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, all args updated
 #define G_2X64(a,b,c,d) \
   a = _mm_add_epi64( a, b ); \
-   d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \
+   d = mm128_swap64_32( _mm_xor_si128( d, a) ); \
   c = _mm_add_epi64( c, d ); \
-   b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \
+   b = mm128_shuflr64_24( _mm_xor_si128( b, c ) ); \
   a = _mm_add_epi64( a, b ); \
-   d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \
+   d = mm128_shuflr64_16( _mm_xor_si128( d, a ) ); \
   c = _mm_add_epi64( c, d ); \
   b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );

@@ -150,12 +150,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   G_2X64( s1, s3, s5, s7 ); \
   mm128_vrol256_64( s6, s7 ); \
   mm128_vror256_64( s2, s3 ); \
-   mm128_swap256_128( s4, s5 ); \
-   G_2X64( s0, s2, s4, s6 ); \
-   G_2X64( s1, s3, s5, s7 ); \
+   G_2X64( s0, s2, s5, s6 ); \
+   G_2X64( s1, s3, s4, s7 ); \
   mm128_vror256_64( s6, s7 ); \
-   mm128_vrol256_64( s2, s3 ); \
-   mm128_swap256_128( s4, s5 );
+   mm128_vrol256_64( s2, s3 );

 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
--- a/algo/radiogatun/sph_radiogatun.c
+++ b/algo/radiogatun/sph_radiogatun.c
--- a/algo/radiogatun/sph_radiogatun.h
+++ b/algo/radiogatun/sph_radiogatun.h
@@ -1,186 +0,0 @@
-/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */
-/**
- * RadioGatun interface.
- *
- * RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters
- * and G. Van Assche, "RadioGatun, a belt-and-mill hash function",
- * presented at the Second Cryptographic Hash Workshop, Santa Barbara,
- * August 24-25, 2006. The main Web site, containing that article, the
- * reference code and some test vectors, appears to be currently located
- * at the following URL: http://radiogatun.noekeon.org/
- *
- * The presentation article does not specify endianness or padding. The
- * reference code uses the following conventions, which we also apply
- * here:
- * <ul>
- * <li>The input message is an integral number of sequences of three
- * words. Each word is either a 32-bit of 64-bit word (depending on
- * the version of RadioGatun).</li>
- * <li>Input bytes are decoded into words using little-endian
- * convention.</li>
- * <li>Padding consists of a single bit of value 1, using little-endian
- * convention within bytes (i.e. for a byte-oriented input, a single
- * byte of value 0x01 is appended), then enough bits of value 0 to finish
- * the current block.</li>
- * <li>Output consists of 256 bits. Successive output words are encoded
- * with little-endian convention.</li>
- * </ul>
- * These conventions are very close to those we use for PANAMA, which is
- * a close ancestor or RadioGatun.
- *
- * RadioGatun is actually a family of functions, depending on some
- * internal parameters. We implement here two functions, with a "belt
- * length" of 13, a "belt width" of 3, and a "mill length" of 19. The
- * RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
- * variant uses 64-bit words.
- *
- * Strictly speaking, the name "RadioGatun" should use an acute accent
- * on the "u", which we omitted here to keep strict ASCII-compatibility
- * of this file.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_radiogatun.h
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifndef SPH_RADIOGATUN_H__
-#define SPH_RADIOGATUN_H__
-
-#include <stddef.h>
-#include "algo/sha/sph_types.h"
-
-/**
- * Output size (in bits) for RadioGatun[32].
- */
-#define SPH_SIZE_radiogatun32   256
-
-/**
- * This structure is a context for RadioGatun[32] computations: it
- * contains intermediate values and some data from the last entered
- * block. Once a RadioGatun[32] computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running RadioGatun[32]
- * computation can be cloned by copying the context (e.g. with a
- * simple <code>memcpy()</code>).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-	unsigned char data[156];   /* first field, for alignment */
-	unsigned data_ptr;
-	sph_u32 a[19], b[39];
-#endif
-} sph_radiogatun32_context;
-
-/**
- * Initialize a RadioGatun[32] context. This process performs no
- * memory allocation.
- *
- * @param cc   the RadioGatun[32] context (pointer to a
- *             <code>sph_radiogatun32_context</code>)
- */
-void sph_radiogatun32_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the RadioGatun[32] context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_radiogatun32(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current RadioGatun[32] computation and output the
- * result into the provided buffer. The destination buffer must be wide
- * enough to accomodate the result (32 bytes). The context is
- * automatically reinitialized.
- *
- * @param cc    the RadioGatun[32] context
- * @param dst   the destination buffer
- */
-void sph_radiogatun32_close(void *cc, void *dst);
-
-#if SPH_64
-
-/**
- * Output size (in bits) for RadioGatun[64].
- */
-#define SPH_SIZE_radiogatun64   256
-
-/**
- * This structure is a context for RadioGatun[64] computations: it
- * contains intermediate values and some data from the last entered
- * block. Once a RadioGatun[64] computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running RadioGatun[64]
- * computation can be cloned by copying the context (e.g. with a
- * simple <code>memcpy()</code>).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-	unsigned char data[312];   /* first field, for alignment */
-	unsigned data_ptr;
-	sph_u64 a[19], b[39];
-#endif
-} sph_radiogatun64_context;
-
-/**
- * Initialize a RadioGatun[64] context. This process performs no
- * memory allocation.
- *
- * @param cc   the RadioGatun[64] context (pointer to a
- *             <code>sph_radiogatun64_context</code>)
- */
-void sph_radiogatun64_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the RadioGatun[64] context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_radiogatun64(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current RadioGatun[64] computation and output the
- * result into the provided buffer. The destination buffer must be wide
- * enough to accomodate the result (32 bytes). The context is
- * automatically reinitialized.
- *
- * @param cc    the RadioGatun[64] context
- * @param dst   the destination buffer
- */
-void sph_radiogatun64_close(void *cc, void *dst);
-
-#endif
-
-#endif
--- a/algo/x20/x20r-gate.c
+++ b/algo/x20/x20r-gate.c
@@ -1,34 +0,0 @@
-#include "x20r-gate.h"
-
-void getAlgoString( const uint8_t* prevblock, char *output )
-{
-    char *sptr = outpuit;
-
-    for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ )
-    {
-        char b = (19 - j) >> 1; // 16 ascii hex chars, reversed
-        uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
-        if (algoDigit >= 10)
-            sprintf(sptr, "%c", 'A' + (algoDigit - 10));
-         else
-            sprintf(sptr, "%u", (uint32_t) algoDigit);
-        sptr++;
-     }
-     *sptr = '\0';
-}
-
-bool register_x20r_algo( algo_gate_t* gate )
-{
-#if defined (X20R_4WAY)
-  gate->scanhash  = (void*)&scanhash_x20r_4way;
-  gate->hash      = (void*)&x20r_4way_hash;
-#else
-  gate->scanhash  = (void*)&scanhash_x20r;
-  gate->hash      = (void*)&x20r_hash;
-#endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  x20_r_s_getAlgoString = (void*)&x20r_getAlgoString;
-  opt_target_factor = 256.;
-  return true;
-};
-
--- a/algo/x20/x20r-gate.h
+++ b/algo/x20/x20r-gate.h
@@ -1,58 +0,0 @@
-#ifndef X20R_GATE_H__
-#define X20R_GATE_H__ 1
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-/*
-#if defined(__AVX2__) && defined(__AES__)
-  #define X20R_4WAY
-#endif
-*/
-
-enum x20r_Algo {
-        BLAKE = 0,
-        BMW,
-        GROESTL,
-        JH,
-        KECCAK,
-        SKEIN,
-        LUFFA,
-        CUBEHASH,
-        SHAVITE,
-        SIMD,
-        ECHO,
-        HAMSI,
-        FUGUE,
-        SHABAL,
-        WHIRLPOOL,
-        SHA_512,
-        HAVAL,      // 256-bits output
-        GOST,
-        RADIOGATUN, // 256-bits output
-        PANAMA,     // 256-bits output
-        X20R_HASH_FUNC_COUNT
-};
-
-void (*x20_r_s_getAlgoString) ( const uint8_t*, char* );
-
-void x20r_getAlgoString( const uint8_t* prevblock, char *output );
-
-bool register_xi20r_algo( algo_gate_t* gate );
-
-#if defined(X20R_4WAY)
-
-void x20r_4way_hash( void *state, const void *input );
-
-int scanhash_x20r_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-
-void x20rhash( void *state, const void *input );
-
-int scanhash_x20r( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-
--- a/algo/x20/x20r.c
+++ b/algo/x20/x20r.c
@@ -1,252 +0,0 @@
-#include "x20r-gate.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "algo/blake/sph_blake.h"
-#include "algo/bmw/sph_bmw.h"
-#include "algo/jh/sph_jh.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/hamsi/sph_hamsi.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/shabal/sph_shabal.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/haval/sph-haval.h"
-#include "algo/radiogatun/sph_radiogatun.h"
-#include "algo/panama/sph_panama.h"
-#include "algo/gost/sph_gost.h"
-#include "algo/sha/sph_sha2.h"
-#if defined(__AES__)
-  #include "algo/echo/aes_ni/hash_api.h"
-  #include "algo/groestl/aes_ni/hash-groestl.h"
-#else
-  #include "algo/groestl/sph_groestl.h"
-  #include "algo/echo/sph_echo.h"
-#endif 
-#include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/cubehash_sse2.h"
-#include "algo/simd/nist.h"
-
-
-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X20R_HASH_FUNC_COUNT + 1] = { 0 };
-
-union _x20r_context_overlay
-{
-    sph_blake512_context     blake;
-    sph_bmw512_context       bmw;
-#if defined(__AES__)
-    hashState_groestl        groestl;
-    hashState_echo           echo;
-#else
-    sph_groestl512_context   groestl;
-    sph_echo512_context      echo;
-#endif
-    sph_skein512_context     skein;
-    sph_jh512_context        jh;
-    sph_keccak512_context    keccak;
-    hashState_luffa          luffa;
-    cubehashParam            cube;
-    hashState_sd             simd;
-    sph_shavite512_context   shavite;
-    sph_hamsi512_context     hamsi;
-    sph_fugue512_context     fugue;
-    sph_shabal512_context    shabal;
-    sph_whirlpool_context    whirlpool;
-    sph_sha512_context       sha512;
-    sph_haval256_5_context   haval;
-    sph_gost512_context      gost;
-    sph_radiogatun64_context radiogatun;
-    sph_panama_context       panama;
-};
-typedef union _x20r_context_overlay x20r_context_overlay;
-
-void x20r_hash(void* output, const void* input)
-{
-   uint32_t _ALIGN(128) hash[64/4];
-   x20r_context_overlay ctx;
-   void *in = (void*) input;
-   int size = 80;
-
-   if ( s_ntime == UINT32_MAX )
-   {
-	const uint8_t* in8 = (uint8_t*) input;
-	x20_r_s_getAlgoString(&in8[4], hashOrder);
-   }
-
-   for (int i = 0; i < 20; i++)
-   {
-	const char elem = hashOrder[i];
-	const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-	switch ( algo )
-       	{
-	   case BLAKE:
-		sph_blake512_init(&ctx.blake);
-		sph_blake512(&ctx.blake, in, size);
-		sph_blake512_close(&ctx.blake, hash);
-		break;
-	   case BMW:
-		sph_bmw512_init(&ctx.bmw);
-		sph_bmw512(&ctx.bmw, in, size);
-		sph_bmw512_close(&ctx.bmw, hash);
-		break;
-	   case GROESTL:
-#if defined(__AES__)
-                init_groestl( &ctx.groestl, 64 );
-                update_and_final_groestl( &ctx.groestl, (char*)hash,
-                                         (const char*)in, size<<3 );
-#else
-                sph_groestl512_init(&ctx.groestl);
-                sph_groestl512(&ctx.groestl, in, size);
-                sph_groestl512_close(&ctx.groestl, hash);
-#endif
-                break;
-           case SKEIN:
-		sph_skein512_init(&ctx.skein);
-		sph_skein512(&ctx.skein, in, size);
-		sph_skein512_close(&ctx.skein, hash);
-		break;
-	   case JH:
-		sph_jh512_init(&ctx.jh);
-		sph_jh512(&ctx.jh, in, size);
-		sph_jh512_close(&ctx.jh, hash);
-		break;
-	   case KECCAK:
-		sph_keccak512_init(&ctx.keccak);
-		sph_keccak512(&ctx.keccak, in, size);
-		sph_keccak512_close(&ctx.keccak, hash);
-		break;
-	   case LUFFA:
-                init_luffa( &ctx.luffa, 512 );
-                update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
-                                        (const BitSequence*)in, size );
-		break;
-           case CUBEHASH:
-                cubehashInit( &ctx.cube, 512, 16, 32 );
-                cubehashUpdateDigest( &ctx.cube, (byte*) hash,
-                                      (const byte*)in, size );
-		break;
-	   case SHAVITE:
-		sph_shavite512_init(&ctx.shavite);
-		sph_shavite512(&ctx.shavite, in, size);
-		sph_shavite512_close(&ctx.shavite, hash);
-		break;
-           case SIMD:
-                init_sd( &ctx.simd, 512 );
-                update_final_sd( &ctx.simd, (BitSequence *)hash,
-                                 (const BitSequence *)in, size<<3 );
-			break;
-           case ECHO:
-#if defined(__AES__)
-                init_echo( &ctx.echo, 512 );
-                update_final_echo ( &ctx.echo, (BitSequence *)hash,
-                                    (const BitSequence *)in, size<<3 );
-#else
-	        sph_echo512_init(&ctx.echo);
-	        sph_echo512(&ctx.echo, in, size);
-	        sph_echo512_close(&ctx.echo, hash);
-#endif
-		break;
-	   case HAMSI:
-		sph_hamsi512_init(&ctx.hamsi);
-		sph_hamsi512(&ctx.hamsi, in, size);
-		sph_hamsi512_close(&ctx.hamsi, hash);
-		break;
-	   case FUGUE:
-		sph_fugue512_init(&ctx.fugue);
-		sph_fugue512(&ctx.fugue, in, size);
-		sph_fugue512_close(&ctx.fugue, hash);
-		break;
-	   case SHABAL:
-		sph_shabal512_init(&ctx.shabal);
-		sph_shabal512(&ctx.shabal, in, size);
-		sph_shabal512_close(&ctx.shabal, hash);
-		break;
-	   case WHIRLPOOL:
-		sph_whirlpool_init(&ctx.whirlpool);
-		sph_whirlpool(&ctx.whirlpool, in, size);
-		sph_whirlpool_close(&ctx.whirlpool, hash);
-		break;
-	   case SHA_512:
-                sph_sha512_Init( &ctx.sha512 );
-                sph_sha512( &ctx.sha512, in, size );
-                sph_sha512_close( &ctx.sha512, hash );
-		break;
-	   case HAVAL:
-		sph_haval256_5_init(&ctx.haval);
-		sph_haval256_5(&ctx.haval, in, size);
-		sph_haval256_5_close(&ctx.haval, hash);
-		memset(&hash[8], 0, 32);
-		break;
-	   case GOST:
-		sph_gost512_init(&ctx.gost);
-		sph_gost512(&ctx.gost, in, size);
-		sph_gost512_close(&ctx.gost, hash);
-		break;
-	   case RADIOGATUN:
-		sph_radiogatun64_init(&ctx.radiogatun);
-		sph_radiogatun64(&ctx.radiogatun, in, size);
-		sph_radiogatun64_close(&ctx.radiogatun, hash);
-		memset(&hash[8], 0, 32);
-		break;
-	   case PANAMA:
-		sph_panama_init(&ctx.panama);
-		sph_panama(&ctx.panama, in, size);
-		sph_panama_close(&ctx.panama, hash);
-		memset(&hash[8], 0, 32);
-		break;
-	}
-   in = (void*) hash;
-   size = 64;
-   }
-   memcpy(output, hash, 32);
-}
-
-int scanhash_x20r( struct work *work, uint32_t max_nonce,
-	           uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t _ALIGN(128) hash32[8];
-   uint32_t _ALIGN(128) endiandata[20];
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   uint32_t nonce = first_nonce;
-   int thr_id = mythr->id;
-   volatile uint8_t *restart = &(work_restart[thr_id].restart);
-
-   for (int k=0; k < 19; k++)
-	be32enc( &endiandata[k], pdata[k] );
-
-   if ( s_ntime != pdata[17] )
-   {
-	uint32_t ntime = swab32(pdata[17]);
-	x20_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
-	s_ntime = ntime;
-	if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
-   }
-
-   if ( opt_benchmark )
-	ptarget[7] = 0x0cff;
-
-   do {
-	be32enc( &endiandata[19], nonce );
-	x20r_hash( hash32, endiandata );
-
-	if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) )
-  	{
-        pdata[19] = nonce;
-        submit_solution( work, hash32, mythr );
-	}
-	nonce++;
-
-   } while (nonce < max_nonce && !(*restart));
-
-   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce + 1;
-   return 0;
-}
--- a/algo/yescrypt/yescrypt-best.c
+++ b/algo/yescrypt/yescrypt-best.c
@@ -1,5 +0,0 @@
-#ifdef __SSE2__
-#include "yescrypt-simd.c"
-#else
-#include "yescrypt-opt.c"
-#endif
--- a/algo/yescrypt/yescrypt-platform.h
+++ b/algo/yescrypt/yescrypt-platform.h
@@ -1,213 +0,0 @@
-/*-
- * Copyright 2013,2014 Alexander Peslyak
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#ifdef MAP_ANON
-#include <sys/mman.h>
-#endif
-
-#include "yescrypt.h"
-#define HUGEPAGE_THRESHOLD		(12 * 1024 * 1024)
-
-#ifdef __x86_64__
-#define HUGEPAGE_SIZE			(2 * 1024 * 1024)
-#else
-#undef HUGEPAGE_SIZE
-#endif
-
-/*
-static __inline uint32_t
-le32dec(const void *pp)
-{
-	const uint8_t *p = (uint8_t const *)pp;
-
-	return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
-	    ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
-}
-
-static __inline void
-le32enc(void *pp, uint32_t x)
-{
-	uint8_t * p = (uint8_t *)pp;
-
-	p[0] = x & 0xff;
-	p[1] = (x >> 8) & 0xff;
-	p[2] = (x >> 16) & 0xff;
-	p[3] = (x >> 24) & 0xff;
-}
-*/
-
-static void *
-alloc_region(yescrypt_region_t * region, size_t size)
-{
-	size_t base_size = size;
-	uint8_t * base, * aligned;
-#ifdef MAP_ANON
-	int flags =
-#ifdef MAP_NOCORE
-	    MAP_NOCORE |
-#endif
-	    MAP_ANON | MAP_PRIVATE;
-#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE)
-	size_t new_size = size;
-	const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1;
-	if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) {
-		flags |= MAP_HUGETLB;
-/*
- * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of
- * huge page size, so let's round up to huge page size here.
- */
-		new_size = size + hugepage_mask;
-		new_size &= ~hugepage_mask;
-	}
-	base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0);
-	if (base != MAP_FAILED) {
-		base_size = new_size;
-	} else
-	if (flags & MAP_HUGETLB) {
-		flags &= ~MAP_HUGETLB;
-		base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
-	}
-
-#else
-	base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
-#endif
-	if (base == MAP_FAILED)
-		base = NULL;
-	aligned = base;
-#elif defined(HAVE_POSIX_MEMALIGN)
-	if ((errno = posix_memalign((void **)&base, 64, size)) != 0)
-		base = NULL;
-	aligned = base;
-#else
-	base = aligned = NULL;
-	if (size + 63 < size) {
-		errno = ENOMEM;
-	} else if ((base = malloc(size + 63)) != NULL) {
-		aligned = base + 63;
-		aligned -= (uintptr_t)aligned & 63;
-	}
-#endif
-	region->base = base;
-	region->aligned = aligned;
-	region->base_size = base ? base_size : 0;
-	region->aligned_size = base ? size : 0;
-	return aligned;
-}
-
-static __inline void
-init_region(yescrypt_region_t * region)
-{
-	region->base = region->aligned = NULL;
-	region->base_size = region->aligned_size = 0;
-}
-
-static int
-free_region(yescrypt_region_t * region)
-{
-	if (region->base) {
-#ifdef MAP_ANON
-		if (munmap(region->base, region->base_size))
-			return -1;
-#else
-		free(region->base);
-#endif
-	}
-	init_region(region);
-	return 0;
-}
-
-int yescrypt_init_shared(yescrypt_shared_t * shared, const uint8_t * param, size_t paramlen,
-    uint64_t N, uint32_t r, uint32_t p, yescrypt_init_shared_flags_t flags, uint32_t mask,
-    uint8_t * buf, size_t buflen)
-{
-	yescrypt_shared1_t* shared1 = &shared->shared1;
-	yescrypt_shared_t dummy, half1, half2;
-	uint8_t salt[32];
-
-	if (flags & YESCRYPT_SHARED_PREALLOCATED) {
-		if (!shared1->aligned || !shared1->aligned_size)
-			return -1;
-	} else {
-		init_region(shared1);
-	}
-	shared->mask1 = 1;
-	if (!param && !paramlen && !N && !r && !p && !buf && !buflen)
-		return 0;
-
-	init_region(&dummy.shared1);
-	dummy.mask1 = 1;
-	if (yescrypt_kdf(&dummy, shared1,
-	    param, paramlen, NULL, 0, N, r, p, 0,
-	    YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
-	    salt, sizeof(salt), 0 ) )
-		goto out;
-
-	half1 = half2 = *shared;
-	half1.shared1.aligned_size /= 2;
-	half2.shared1.aligned = (void*) ((size_t)half2.shared1.aligned + half1.shared1.aligned_size);
-	half2.shared1.aligned_size = half1.shared1.aligned_size;
-	N /= 2;
-
-	if (p > 1 && yescrypt_kdf(&half1, &half2.shared1,
-	    param, paramlen, salt, sizeof(salt), N, r, p, 0,
-	    YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2,
-	    salt, sizeof(salt), 0 ))
-		goto out;
-
-	if (yescrypt_kdf(&half2, &half1.shared1,
-	    param, paramlen, salt, sizeof(salt), N, r, p, 0,
-	    YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
-	    salt, sizeof(salt), 0))
-		goto out;
-
-	if (yescrypt_kdf(&half1, &half2.shared1,
-	    param, paramlen, salt, sizeof(salt), N, r, p, 0,
-	    YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1,
-	    buf, buflen, 0))
-		goto out;
-
-	shared->mask1 = mask;
-
-	return 0;
-
-out:
-	if (!(flags & YESCRYPT_SHARED_PREALLOCATED))
-		free_region(shared1);
-	return -1;
-}
-
-int
-yescrypt_free_shared(yescrypt_shared_t * shared)
-{
-	return free_region(&shared->shared1);
-}
-
-int
-yescrypt_init_local(yescrypt_local_t * local)
-{
-	init_region(local);
-	return 0;
-}
-
-int
-yescrypt_free_local(yescrypt_local_t * local)
-{
-	return free_region(local);
-}
--- a/algo/yescrypt/yescrypt-simd.c
+++ b/algo/yescrypt/yescrypt-simd.c
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -1,488 +0,0 @@
-/*-
- * Copyright 2013,2014 Alexander Peslyak
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "compat.h"
-
-#include "yescrypt.h"
-#include "algo/sha/hmac-sha256-hash.h"
-#include "algo-gate-api.h"
-
-#define BYTES2CHARS(bytes) \
-	((((bytes) * 8) + 5) / 6)
-
-#define HASH_SIZE 32 /* bytes */
-#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */
-#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM)
-
-static const char * const itoa64 =
-	"./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-static uint8_t* encode64_uint32(uint8_t* dst, size_t dstlen, uint32_t src, uint32_t srcbits)
-{
-	uint32_t bit;
-
-	for (bit = 0; bit < srcbits; bit += 6) {
-		if (dstlen < 1)
-			return NULL;
-		*dst++ = itoa64[src & 0x3f];
-		dstlen--;
-		src >>= 6;
-	}
-
-	return dst;
-}
-
-static uint8_t* encode64(uint8_t* dst, size_t dstlen, const uint8_t* src, size_t srclen)
-{
-	size_t i;
-
-	for (i = 0; i < srclen; ) {
-		uint8_t * dnext;
-		uint32_t value = 0, bits = 0;
-		do {
-			value |= (uint32_t)src[i++] << bits;
-			bits += 8;
-		} while (bits < 24 && i < srclen);
-		dnext = encode64_uint32(dst, dstlen, value, bits);
-		if (!dnext)
-			return NULL;
-		dstlen -= dnext - dst;
-		dst = dnext;
-	}
-
-	return dst;
-}
-
-static int decode64_one(uint32_t* dst, uint8_t src)
-{
-	const char * ptr = strchr(itoa64, src);
-	if (ptr) {
-		*dst = (uint32_t) (ptr - itoa64);
-		return 0;
-	}
-	*dst = 0;
-	return -1;
-}
-
-static const uint8_t* decode64_uint32(uint32_t* dst, uint32_t dstbits, const uint8_t* src)
-{
-	uint32_t bit;
-	uint32_t value;
-
-	value = 0;
-	for (bit = 0; bit < dstbits; bit += 6) {
-		uint32_t one;
-		if (decode64_one(&one, *src)) {
-			*dst = 0;
-			return NULL;
-		}
-		src++;
-		value |= one << bit;
-	}
-
-	*dst = value;
-	return src;
-}
-
-uint8_t* yescrypt_r(const yescrypt_shared_t* shared, yescrypt_local_t* local,
-    const uint8_t* passwd, size_t passwdlen, const uint8_t* setting,
-    uint8_t* buf, size_t buflen, int thrid )
-{
-	uint8_t hash[HASH_SIZE];
-	const uint8_t * src, * salt;
-	uint8_t * dst;
-	size_t prefixlen, saltlen, need;
-	uint8_t version;
-	uint64_t N;
-	uint32_t r, p;
-	yescrypt_flags_t flags = YESCRYPT_WORM;
-
-	printf("pass1 ...");
-	fflush(stdout);
-
-	if (setting[0] != '$' || setting[1] != '7') {
-		printf("died$7 ...");
-		fflush(stdout);
-		return NULL;
-	}
-
-	printf("died80 ...");
-	fflush(stdout);
-
-	src = setting + 2;
-
-	printf("hello '%p'\n", (char *)src);
-	fflush(stdout);
-
-	switch ((version = *src)) {
-	case '$':
-		printf("died2 ...");
-		fflush(stdout);
-		break;
-	case 'X':
-		src++;
-		flags = YESCRYPT_RW;
-		printf("died3 ...");
-		fflush(stdout);
-		break;
-	default:
-		printf("died4 ...");
-		fflush(stdout);
-		return NULL;
-	}
-
-	printf("pass2 ...");
-	fflush(stdout);
-
-	if (*src != '$') {
-		uint32_t decoded_flags;
-		if (decode64_one(&decoded_flags, *src)) {
-			printf("died5 ...");
-			fflush(stdout);
-			return NULL;
-		}
-		flags = decoded_flags;
-		if (*++src != '$') {
-			printf("died6 ...");
-			fflush(stdout);
-			return NULL;
-		}
-	}
-
-	src++;
-
-	{
-		uint32_t N_log2;
-		if (decode64_one(&N_log2, *src)) {
-			printf("died7 ...");
-			return NULL;
-		}
-		src++;
-		N = (uint64_t)1 << N_log2;
-	}
-
-	src = decode64_uint32(&r, 30, src);
-	if (!src) {
-		printf("died6 ...");
-		return NULL;
-	}
-
-	src = decode64_uint32(&p, 30, src);
-	if (!src) {
-		printf("died7 ...");
-		return NULL;
-	}
-
-	prefixlen = src - setting;
-
-	salt = src;
-	src = (uint8_t *)strrchr((char *)salt, '$');
-	if (src)
-		saltlen = src - salt;
-	else
-		saltlen = strlen((char *)salt);
-
-	need = prefixlen + saltlen + 1 + HASH_LEN + 1;
-	if (need > buflen || need < saltlen) {
-		printf("'%d %d %d'", (int) need, (int) buflen, (int) saltlen);
-		printf("died8killbuf ...");
-		fflush(stdout);
-		return NULL;
-	}
-
-	if ( yescrypt_kdf( shared, local, passwd, passwdlen, salt, saltlen, N, r, p,
-            0, flags, hash, sizeof(hash), thrid ) == -1 )
-   {
-		printf("died10 ...");
-		fflush(stdout);
-		return NULL;
-	}
-
-	dst = buf;
-	memcpy(dst, setting, prefixlen + saltlen);
-	dst += prefixlen + saltlen;
-	*dst++ = '$';
-
-	dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash));
-	/* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its
-	 * memory allocations yet anyway. */
-	if (!dst || dst >= buf + buflen) { /* Can't happen */
-		printf("died11 ...");
-		return NULL;
-	}
-
-	*dst = 0; /* NUL termination */
-
-	printf("died12 ...");
-	fflush(stdout);
-
-	return buf;
-}
-
-uint8_t* yescrypt(const uint8_t* passwd, const uint8_t* setting, int thrid )
-{
-	static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1];
-	yescrypt_shared_t shared;
-	yescrypt_local_t local;
-	uint8_t * retval;
-
-	if (yescrypt_init_shared(&shared, NULL, 0,
-	    0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
-		return NULL;
-	if (yescrypt_init_local(&local)) {
-		yescrypt_free_shared(&shared);
-		return NULL;
-	}
-	retval = yescrypt_r(&shared, &local,
-	    passwd, 80, setting, buf, sizeof(buf), thrid );
-	//printf("hashse='%s'\n", (char *)retval);
-	if (yescrypt_free_local(&local)) {
-		yescrypt_free_shared(&shared);
-		return NULL;
-	}
-	if (yescrypt_free_shared(&shared))
-		return NULL;
-	return retval;
-}
-
-uint8_t* yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags,
-    const uint8_t* src, size_t srclen, uint8_t* buf, size_t buflen)
-{
-	uint8_t * dst;
-	size_t prefixlen = 3 + 1 + 5 + 5;
-	size_t saltlen = BYTES2CHARS(srclen);
-	size_t need;
-
-	if (p == 1)
-		flags &= ~YESCRYPT_PARALLEL_SMIX;
-
-	if (flags) {
-		if (flags & ~0x3f)
-			return NULL;
-
-		prefixlen++;
-		if (flags != YESCRYPT_RW)
-			prefixlen++;
-	}
-
-	need = prefixlen + saltlen + 1;
-	if (need > buflen || need < saltlen || saltlen < srclen)
-		return NULL;
-
-	if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30)))
-		return NULL;
-
-	dst = buf;
-	*dst++ = '$';
-	*dst++ = '7';
-	if (flags) {
-		*dst++ = 'X'; /* eXperimental, subject to change */
-		if (flags != YESCRYPT_RW)
-			*dst++ = itoa64[flags];
-	}
-	*dst++ = '$';
-
-	*dst++ = itoa64[N_log2];
-
-	dst = encode64_uint32(dst, buflen - (dst - buf), r, 30);
-	if (!dst) /* Can't happen */
-		return NULL;
-
-	dst = encode64_uint32(dst, buflen - (dst - buf), p, 30);
-	if (!dst) /* Can't happen */
-		return NULL;
-
-	dst = encode64(dst, buflen - (dst - buf), src, srclen);
-	if (!dst || dst >= buf + buflen) /* Can't happen */
-		return NULL;
-
-	*dst = 0; /* NUL termination */
-
-	return buf;
-}
-
-uint8_t* yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, yescrypt_flags_t flags,
-    const uint8_t * src, size_t srclen)
-{
-	static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1];
-	return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen,
-	    buf, sizeof(buf));
-}
-
-static int yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
-    const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
-    uint8_t * buf, size_t buflen, int thrid )
-{
-	static __thread int initialized = 0;
-	static __thread yescrypt_shared_t shared;
-	static __thread yescrypt_local_t local;
-	int retval;
-	if (!initialized) {
-/* "shared" could in fact be shared, but it's simpler to keep it private
- * along with "local".  It's dummy and tiny anyway. */
-		if (yescrypt_init_shared(&shared, NULL, 0,
-		    0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
-			return -1;
-		if (yescrypt_init_local(&local)) {
-			yescrypt_free_shared(&shared);
-			return -1;
-		}
-		initialized = 1;
-	}
-	retval = yescrypt_kdf(&shared, &local,
-	    passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS,
-	    buf, buflen, thrid );
-#if 0
-	if (yescrypt_free_local(&local)) {
-		yescrypt_free_shared(&shared);
-		return -1;
-	}
-	if (yescrypt_free_shared(&shared))
-		return -1;
-	initialized = 0;
-#endif
-	return retval;
-}
-
-// scrypt parameters initialized at run time.
-uint64_t YESCRYPT_N;
-uint32_t YESCRYPT_R;
-uint32_t YESCRYPT_P;
-char *yescrypt_client_key = NULL;
-int yescrypt_client_key_len = 0;
-
-/* main hash 80 bytes input */
-int yescrypt_hash( const char *input, char *output, uint32_t len, int thrid )
-{
-   return yescrypt_bsty( (uint8_t*)input, len, (uint8_t*)input, len, YESCRYPT_N,
-                  YESCRYPT_R, YESCRYPT_P, (uint8_t*)output, 32, thrid );
-}
-
-/* for util.c test */
-int yescrypthash(void *output, const void *input, int thrid)
-{
-	return yescrypt_hash((char*) input, (char*) output, 80, thrid);
-}
-
-int scanhash_yescrypt( struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t _ALIGN(64) vhash[8];
-   uint32_t _ALIGN(64) endiandata[20];
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce;
-   uint32_t n = first_nonce;
-   int thr_id = mythr->id; 
-
-   for ( int k = 0; k < 19; k++ )
-      be32enc( &endiandata[k], pdata[k] );
-   endiandata[19] = n;
-   do {
-      if ( yescrypt_hash((char*) endiandata, (char*) vhash, 80, thr_id ) )
-      if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
-      {
-          be32enc( pdata+19, n );
-          submit_solution( work, vhash, mythr );
-      }
-      endiandata[19] = ++n;
-   } while ( n < last_nonce && !work_restart[thr_id].restart );
-   *hashes_done = n - first_nonce;
-   pdata[19] = n;
-   return 0;
-}
-
-void yescrypt_gate_base(algo_gate_t *gate )
-{
-   gate->optimizations = SSE2_OPT | SHA_OPT;
-   gate->scanhash   = (void*)&scanhash_yescrypt;
-   gate->hash       = (void*)&yescrypt_hash;
-   opt_target_factor = 65536.0;
-}
-
-bool register_yescrypt_algo( algo_gate_t* gate )
-{
-   yescrypt_gate_base( gate );
-
-   if ( opt_param_n )  YESCRYPT_N = opt_param_n;
-   else                YESCRYPT_N = 2048;
-
-   if ( opt_param_r )  YESCRYPT_R = opt_param_r;
-   else                YESCRYPT_R = 8;
- 
-   if ( opt_param_key ) 
-   {   
-     yescrypt_client_key = opt_param_key;
-     yescrypt_client_key_len = strlen( opt_param_key );
-   }
-   else
-   {   
-     yescrypt_client_key = NULL;
-     yescrypt_client_key_len = 0;
-   }
-
-   YESCRYPT_P = 1;
-
-   applog( LOG_NOTICE,"Yescrypt parameters: N= %d, R= %d", YESCRYPT_N,
-                                                            YESCRYPT_R );
-   if ( yescrypt_client_key )
-     applog( LOG_NOTICE,"Key= \"%s\"\n", yescrypt_client_key );
-
-   return true;
-}
-
-bool register_yescryptr8_algo( algo_gate_t* gate )
-{
-   yescrypt_gate_base( gate );
-   yescrypt_client_key = "Client Key";
-   yescrypt_client_key_len = 10;
-   YESCRYPT_N = 2048;
-   YESCRYPT_R = 8;
-   YESCRYPT_P = 1;
-   return true;
-}
-
-bool register_yescryptr16_algo( algo_gate_t* gate )
-{
-   yescrypt_gate_base( gate );
-   yescrypt_client_key = "Client Key";
-   yescrypt_client_key_len = 10;
-   YESCRYPT_N = 4096;   
-   YESCRYPT_R = 16;   
-   YESCRYPT_P = 1;   
-   return true;
-}
-
-bool register_yescryptr32_algo( algo_gate_t* gate )
-{
-   yescrypt_gate_base( gate );
-   yescrypt_client_key = "WaviBanana";
-   yescrypt_client_key_len = 10;
-   YESCRYPT_N = 4096;
-   YESCRYPT_R = 32;
-   YESCRYPT_P = 1;
-   return true;
-}
-
--- a/algo/yescrypt/yescrypt.h
+++ b/algo/yescrypt/yescrypt.h
@@ -1,382 +0,0 @@
-/*-
- * Copyright 2009 Colin Percival
- * Copyright 2013,2014 Alexander Peslyak
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * This file was originally written by Colin Percival as part of the Tarsnap
- * online backup system.
- */
-
-#ifndef YESCRYPT_H
-#define YESCRYPT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdint.h>
-#include <stdlib.h> /* for size_t */
-#include <stdbool.h>
-#include "miner.h"
-
-//#define  __SSE4_1__
-
-int yescrypt_hash(const char* input, char* output, uint32_t len, int thrid );
-
-int yescrypthash(void *output, const void *input, int thrid );
-
-/**
- * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen):
- * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
- * p, buflen) and write the result into buf.  The parameters r, p, and buflen
- * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32.  The parameter N
- * must be a power of 2 greater than 1.
- *
- * Return 0 on success; or -1 on error.
- *
- * MT-safe as long as buf is local to the thread.
- */
-extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen,
-    const uint8_t * __salt, size_t __saltlen,
-    uint64_t __N, uint32_t __r, uint32_t __p,
-    uint8_t * __buf, size_t __buflen);
-
-/**
- * Internal type used by the memory allocator.  Please do not use it directly.
- * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since
- * they might differ from each other in a future version.
- */
-typedef struct {
-	void * base, * aligned;
-	size_t base_size, aligned_size;
-} yescrypt_region_t;
-
-/**
- * Types for shared (ROM) and thread-local (RAM) data structures.
- */
-typedef yescrypt_region_t yescrypt_shared1_t;
-typedef struct {
-	yescrypt_shared1_t shared1;
-	uint32_t mask1;
-} yescrypt_shared_t;
-typedef yescrypt_region_t yescrypt_local_t;
-
-/**
- * Possible values for yescrypt_init_shared()'s flags argument.
- */
-typedef enum {
-	YESCRYPT_SHARED_DEFAULTS = 0,
-	YESCRYPT_SHARED_PREALLOCATED = 0x100
-} yescrypt_init_shared_flags_t;
-
-/**
- * Possible values for the flags argument of yescrypt_kdf(),
- * yescrypt_gensalt_r(), yescrypt_gensalt().  These may be OR'ed together,
- * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive.
- * Please refer to the description of yescrypt_kdf() below for the meaning of
- * these flags.
- */
-typedef enum {
-/* public */
-	YESCRYPT_WORM = 0,
-	YESCRYPT_RW = 1,
-	YESCRYPT_PARALLEL_SMIX = 2,
-	YESCRYPT_PWXFORM = 4,
-/* private */
-	__YESCRYPT_INIT_SHARED_1 = 0x10000,
-	__YESCRYPT_INIT_SHARED_2 = 0x20000,
-	__YESCRYPT_INIT_SHARED = 0x30000
-} yescrypt_flags_t;
-
-extern char *yescrypt_client_key;
-extern int yescrypt_client_key_len;
-
-
-#define YESCRYPT_KNOWN_FLAGS \
-	(YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \
-	__YESCRYPT_INIT_SHARED)
-
-/**
- * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask,
- *     buf, buflen):
- * Optionally allocate memory for and initialize the shared (ROM) data
- * structure.  The parameters N, r, and p must satisfy the same conditions as
- * with crypto_scrypt().  param and paramlen specify a local parameter with
- * which the ROM is seeded.  If buf is not NULL, then it is used to return
- * buflen bytes of message digest for the initialized ROM (the caller may use
- * this to verify that the ROM has been computed in the same way that it was on
- * a previous run).
- *
- * Return 0 on success; or -1 on error.
- *
- * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the
- * ROM is assumed to have been preallocated by the caller, with
- * shared->shared1.aligned being the start address of the ROM and
- * shared->shared1.aligned_size being its size (which must be consistent with
- * N, r, and p).  This may be used e.g. when the ROM is to be placed in a SysV
- * shared memory segment allocated by the caller.
- *
- * mask controls the frequency of ROM accesses by yescrypt_kdf().  Normally it
- * should be set to 1, to interleave RAM and ROM accesses, which works well
- * when both regions reside in the machine's RAM anyway.  Other values may be
- * used e.g. when the ROM is memory-mapped from a disk file.  Recommended mask
- * values are powers of 2 minus 1 or minus 2.  Here's the effect of some mask
- * values:
- * mask	value	ROM accesses in SMix 1st loop	ROM accesses in SMix 2nd loop
- *	0		0				1/2
- *	1		1/2				1/2
- *	2		0				1/4
- *	3		1/4				1/4
- *	6		0				1/8
- *	7		1/8				1/8
- *	14		0				1/16
- *	15		1/16				1/16
- *	1022		0				1/1024
- *	1023		1/1024				1/1024
- *
- * Actual computation of the ROM contents may be avoided, if you don't intend
- * to use a ROM but need a dummy shared structure, by calling this function
- * with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the
- * arguments starting with param and on.
- *
- * MT-safe as long as shared is local to the thread.
- */
-extern int yescrypt_init_shared(yescrypt_shared_t * __shared,
-    const uint8_t * __param, size_t __paramlen,
-    uint64_t __N, uint32_t __r, uint32_t __p,
-    yescrypt_init_shared_flags_t __flags, uint32_t __mask,
-    uint8_t * __buf, size_t __buflen);
-
-/**
- * yescrypt_free_shared(shared):
- * Free memory that had been allocated with yescrypt_init_shared().
- *
- * Return 0 on success; or -1 on error.
- *
- * MT-safe as long as shared is local to the thread.
- */
-extern int yescrypt_free_shared(yescrypt_shared_t * __shared);
-
-/**
- * yescrypt_init_local(local):
- * Initialize the thread-local (RAM) data structure.  Actual memory allocation
- * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r().
- *
- * Return 0 on success; or -1 on error.
- *
- * MT-safe as long as local is local to the thread.
- */
-extern int yescrypt_init_local(yescrypt_local_t * __local);
-
-/**
- * yescrypt_free_local(local):
- * Free memory that may have been allocated for an initialized thread-local
- * (RAM) data structure.
- *
- * Return 0 on success; or -1 on error.
- *
- * MT-safe as long as local is local to the thread.
- */
-extern int yescrypt_free_local(yescrypt_local_t * __local);
-
-/**
- * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
- *     N, r, p, t, flags, buf, buflen):
- * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
- * p, buflen), or a revision of scrypt as requested by flags and shared, and
- * write the result into buf.  The parameters N, r, p, and buflen must satisfy
- * the same conditions as with crypto_scrypt().  t controls computation time
- * while not affecting peak memory usage.  shared and flags may request
- * special modes as described below.  local is the thread-local data
- * structure, allowing to preserve and reuse a memory allocation across calls,
- * thereby reducing its overhead.
- *
- * Return 0 on success; or -1 on error.
- *
- * t controls computation time.  t = 0 is optimal in terms of achieving the
- * highest area-time for ASIC attackers.  Thus, higher computation time, if
- * affordable, is best achieved by increasing N rather than by increasing t.
- * However, if the higher memory usage (which goes along with higher N) is not
- * affordable, or if fine-tuning of the time is needed (recall that N must be a
- * power of 2), then t = 1 or above may be used to increase time while staying
- * at the same peak memory usage.  t = 1 increases the time by 25% and
- * decreases the normalized area-time to 96% of optimal.  (Of course, in
- * absolute terms the area-time increases with higher t.  It's just that it
- * would increase slightly more with higher N*r rather than with higher t.)
- * t = 2 increases the time by another 20% and decreases the normalized
- * area-time to 89% of optimal.  Thus, these two values are reasonable to use
- * for fine-tuning.  Values of t higher than 2 result in further increase in
- * time while reducing the efficiency much further (e.g., down to around 50% of
- * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact
- * numbers varying by the flags settings).
- *
- * Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and
- * passing a dummy shared structure (see the description of
- * yescrypt_init_shared() above for how to produce one).  In this mode, the
- * thread-local memory region (RAM) is first sequentially written to and then
- * randomly read from.  This algorithm is friendly towards time-memory
- * tradeoffs (TMTO), available both to defenders (albeit not in this
- * implementation) and to attackers.
- *
- * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local
- * memory region (RAM), which makes TMTO a lot less efficient.  This may be
- * used to slow down the kinds of attackers who would otherwise benefit from
- * classic scrypt's efficient TMTO.  Since classic scrypt's TMTO allows not
- * only for the tradeoff, but also for a decrease of attacker's area-time (by
- * up to a constant factor), setting YESCRYPT_RW substantially increases the
- * cost of attacks in area-time terms as well.  Yet another benefit of it is
- * that optimal area-time is reached at an earlier time than with classic
- * scrypt, and t = 0 actually corresponds to this earlier completion time,
- * resulting in quicker hash computations (and thus in higher request rate
- * capacity).  Due to these properties, YESCRYPT_RW should almost always be
- * set, except when compatibility with classic scrypt or TMTO-friendliness are
- * desired.
- *
- * YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a
- * lower level as compared to where it is in classic scrypt.  This reduces
- * flexibility for efficient computation (for both attackers and defenders) by
- * requiring that, short of resorting to TMTO, the full amount of memory be
- * allocated as needed for the specified p, regardless of whether that
- * parallelism is actually being fully made use of or not.  (For comparison, a
- * single instance of classic scrypt may be computed in less memory without any
- * CPU time overhead, but in more real time, by not making full use of the
- * parallelism.)  This may be desirable when the defender has enough memory
- * with sufficiently low latency and high bandwidth for efficient full parallel
- * execution, yet the required memory size is high enough that some likely
- * attackers might end up being forced to choose between using higher latency
- * memory than they could use otherwise (waiting for data longer) or using TMTO
- * (waiting for data more times per one hash computation).  The area-time cost
- * for other kinds of attackers (who would use the same memory type and TMTO
- * factor or no TMTO either way) remains roughly the same, given the same
- * running time for the defender.  In the TMTO-friendly YESCRYPT_WORM mode, as
- * long as the defender has enough memory that is just as fast as the smaller
- * per-thread regions would be, doesn't expect to ever need greater
- * flexibility (except possibly via TMTO), and doesn't need backwards
- * compatibility with classic scrypt, there are no other serious drawbacks to
- * this setting.  In the YESCRYPT_RW mode, which is meant to discourage TMTO,
- * this new approach to parallelization makes TMTO less inefficient.  (This is
- * an unfortunate side-effect of avoiding some random writes, as we have to in
- * order to allow for parallel threads to access a common memory region without
- * synchronization overhead.)  Thus, in this mode this setting poses an extra
- * tradeoff of its own (higher area-time cost for a subset of attackers vs.
- * better TMTO resistance).  Setting YESCRYPT_PARALLEL_SMIX also changes the
- * way the running time is to be controlled from N*r*p (for classic scrypt) to
- * N*r (in this modification).  All of this applies only when p > 1.  For
- * p = 1, this setting is a no-op.
- *
- * Passing a real shared structure, with ROM contents previously computed by
- * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for
- * the thread-local RAM region.  In order to allow for initialization of the
- * ROM to be split into a separate program, the shared->shared1.aligned and
- * shared->shared1.aligned_size fields may be set by the caller of
- * yescrypt_kdf() manually rather than with yescrypt_init_shared().
- *
- * local must be initialized with yescrypt_init_local().
- *
- * MT-safe as long as local and buf are local to the thread.
- */
-extern int yescrypt_kdf(const yescrypt_shared_t * __shared,
-    yescrypt_local_t * __local,
-    const uint8_t * __passwd, size_t __passwdlen,
-    const uint8_t * __salt, size_t __saltlen,
-    uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t,
-    yescrypt_flags_t __flags,
-    uint8_t * __buf, size_t __buflen, int thrid);
-
-/**
- * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen):
- * Compute and encode an scrypt or enhanced scrypt hash of passwd given the
- * parameters and salt value encoded in setting.  If the shared structure is
- * not dummy, a ROM is used and YESCRYPT_RW is required.  Otherwise, whether to
- * use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff
- * discouraging modification) is determined by the setting string.  shared and
- * local must be initialized as described above for yescrypt_kdf().  buf must
- * be large enough (as indicated by buflen) to hold the encoded hash string.
- *
- * Return the encoded hash string on success; or NULL on error.
- *
- * MT-safe as long as local and buf are local to the thread.
- */
-extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared,
-    yescrypt_local_t * __local,
-    const uint8_t * __passwd, size_t __passwdlen,
-    const uint8_t * __setting,
-    uint8_t * __buf, size_t __buflen, int thrid);
-
-/**
- * yescrypt(passwd, setting):
- * Compute and encode an scrypt or enhanced scrypt hash of passwd given the
- * parameters and salt value encoded in setting.  Whether to use the
- * YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff
- * discouraging modification) is determined by the setting string.
- *
- * Return the encoded hash string on success; or NULL on error.
- *
- * This is a crypt(3)-like interface, which is simpler to use than
- * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM,
- * and it is slower than yescrypt_r() for repeated calls because it allocates
- * and frees memory on each call.
- *
- * MT-unsafe.
- */
-extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting, int thrid );
-
-/**
- * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen):
- * Generate a setting string for use with yescrypt_r() and yescrypt() by
- * encoding into it the parameters N_log2 (which is to be set to base 2
- * logarithm of the desired value for N), r, p, flags, and a salt given by src
- * (of srclen bytes).  buf must be large enough (as indicated by buflen) to
- * hold the setting string.
- *
- * Return the setting string on success; or NULL on error.
- *
- * MT-safe as long as buf is local to the thread.
- */
-extern uint8_t * yescrypt_gensalt_r(
-    uint32_t __N_log2, uint32_t __r, uint32_t __p,
-    yescrypt_flags_t __flags,
-    const uint8_t * __src, size_t __srclen,
-    uint8_t * __buf, size_t __buflen);
-
-/**
- * yescrypt_gensalt(N_log2, r, p, flags, src, srclen):
- * Generate a setting string for use with yescrypt_r() and yescrypt().  This
- * function is the same as yescrypt_gensalt_r() except that it uses a static
- * buffer and thus is not MT-safe.
- *
- * Return the setting string on success; or NULL on error.
- *
- * MT-unsafe.
- */
-extern uint8_t * yescrypt_gensalt(
-    uint32_t __N_log2, uint32_t __r, uint32_t __p,
-    yescrypt_flags_t __flags,
-    const uint8_t * __src, size_t __srclen);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/algo/yespower/crypto/blake2b-yp.c
+++ b/algo/yespower/crypto/blake2b-yp.c
@@ -1,323 +0,0 @@
-/*
- * Copyright 2009 Colin Percival, 2014 savale
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *  notice, this list of conditions and the following disclaimer in the
- *  documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * This file was originally written by Colin Percival as part of the Tarsnap
- * online backup system.
- */
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include "simd-utils.h"
-#include <algo/yespower/crypto/sph_types.h>
-#include "blake2b-yp.h"
-
-// Cyclic right rotation.
-//#ifndef ROTR64
-//#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
-//#endif
-
-#define ROTR64(x, y) ror64( x, y )
-
-// Little-endian byte access.
-#define B2B_GET64(p)                            \
-    (((uint64_t) ((uint8_t *) (p))[0]) ^        \
-    (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^  \
-    (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
-    (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
-    (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
-    (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
-    (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
-    (((uint64_t) ((uint8_t *) (p))[7]) << 56))
-
-// G Mixing function.
-#define B2B_G(a, b, c, d, x, y) {   \
-    v[a] = v[a] + v[b] + x;      \
-    v[d] = ROTR64(v[d] ^ v[a], 32); \
-    v[c] = v[c] + v[d];          \
-    v[b] = ROTR64(v[b] ^ v[c], 24); \
-    v[a] = v[a] + v[b] + y;      \
-    v[d] = ROTR64(v[d] ^ v[a], 16); \
-    v[c] = v[c] + v[d];          \
-    v[b] = ROTR64(v[b] ^ v[c], 63); }
-
-// Initialization Vector.
-static const uint64_t blake2b_iv[8] = {
-    0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
-    0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
-    0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
-    0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
-};
-
-// Compression function. "last" flag indicates last block.
-static void blake2b_compress(blake2b_yp_ctx *ctx, int last)
-{
-    const uint8_t sigma[12][16] = {
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-        { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-        { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-        { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-        { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-        { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-        { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-        { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-        { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-        { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
-    };
-    int i;
-    uint64_t v[16], m[16];
-
-    // init work variables
-    for (i = 0; i < 8; i++) {
-        v[i] = ctx->h[i];
-        v[i + 8] = blake2b_iv[i];
-    }
-
-    v[12] ^= ctx->t[0]; // low 64 bits of offset
-    v[13] ^= ctx->t[1]; // high 64 bits
-
-    // last block flag set ?
-    if (last) { 
-        v[14] = ~v[14];
-    }
-
-    // get little-endian words
-    for (i = 0; i < 16; i++) {
-        m[i] = B2B_GET64(&ctx->b[8 * i]);
-    }
-
-    // twelve rounds
-    for (i = 0; i < 12; i++) {
-        B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
-        B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
-        B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
-        B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
-        B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
-        B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
-        B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
-        B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
-    }
-
-    for(i = 0; i < 8; ++i) {
-        ctx->h[i] ^= v[i] ^ v[i + 8];
-    }
-}
-
-// Initialize the hashing context "ctx" with optional key "key".
-// 1 <= outlen <= 64 gives the digest size in bytes.
-// Secret key (also <= 64 bytes) is optional (keylen = 0).
-int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen,
-    const void *key, size_t keylen) // (keylen=0: no key)
-{
-    size_t i;
-
-    // illegal parameters
-    if (outlen == 0 || outlen > 64 || keylen > 64) {
-        return -1;
-    }
-
-    // state, "param block"
-    for (i = 0; i < 8; i++) {
-        ctx->h[i] = blake2b_iv[i];
-    }
-
-    ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
-
-    ctx->t[0] = 0; // input count low word
-    ctx->t[1] = 0; // input count high word
-    ctx->c = 0; // pointer within buffer
-    ctx->outlen = outlen;
-
-    // zero input block
-    for (i = keylen; i < 128; i++) {
-        ctx->b[i] = 0;
-    }
-
-    if (keylen > 0) {
-        blake2b_yp_update(ctx, key, keylen);
-        ctx->c = 128; // at the end
-    }
-
-    return 0;
-}
-
-// Add "inlen" bytes from "in" into the hash.
-void blake2b_yp_update(blake2b_yp_ctx *ctx,
-    const void *in, size_t inlen) // data bytes
-{
-    size_t i;
-    for (i = 0; i < inlen; i++) {
-        if (ctx->c == 128) { // buffer full ?
-            ctx->t[0] += ctx->c; // add counters
-            if (ctx->t[0] < ctx->c) // carry overflow ?
-                ctx->t[1]++; // high word
-            blake2b_compress(ctx, 0); // compress (not last)
-            ctx->c = 0; // counter to zero
-        }
-        ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
-    }
-}
-
-// Generate the message digest (size given in init).
-// Result placed in "out".
-void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out)
-{
-    size_t i;
-
-    ctx->t[0] += ctx->c; // mark last block offset
-    // carry overflow
-    if (ctx->t[0] < ctx->c) {
-        ctx->t[1]++; // high word
-    }
-
-    // fill up with zeros
-    while (ctx->c < 128) {
-        ctx->b[ctx->c++] = 0;
-    }
-
-    blake2b_compress(ctx, 1); // final block flag = 1
-
-    // little endian convert and store
-    for (i = 0; i < ctx->outlen; i++) {
-        ((uint8_t *) out)[i] =
-            (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
-    }
-}
-
-// inlen = number of bytes
-void blake2b_yp_hash(void *out, const void *in, size_t inlen) {
-    blake2b_yp_ctx ctx;
-    blake2b_yp_init(&ctx, 32, NULL, 0);
-    blake2b_yp_update(&ctx, in, inlen);
-    blake2b_yp_final(&ctx, out);
-}
-
-// // keylen = number of bytes
-void hmac_blake2b_yp_init(hmac_yp_ctx *hctx, const void *_key, size_t keylen) {
-    const uint8_t *key = _key;
-    uint8_t keyhash[32];
-    uint8_t pad[64];
-    uint64_t i;
-
-    if (keylen > 64) {
-        blake2b_yp_hash(keyhash, key, keylen);
-        key = keyhash;
-        keylen = 32;
-    }
-
-    blake2b_yp_init(&hctx->inner, 32, NULL, 0);
-    memset(pad, 0x36, 64);
-    for (i = 0; i < keylen; ++i) {
-        pad[i] ^= key[i];
-    }
-
-    blake2b_yp_update(&hctx->inner, pad, 64);
-    blake2b_yp_init(&hctx->outer, 32, NULL, 0);
-    memset(pad, 0x5c, 64);
-    for (i = 0; i < keylen; ++i) {
-        pad[i] ^= key[i];
-    }
-
-    blake2b_yp_update(&hctx->outer, pad, 64);
-    memset(keyhash, 0, 32);
-}
-
-// datalen = number of bits
-void hmac_blake2b_yp_update(hmac_yp_ctx *hctx, const void *data, size_t datalen) {
-    // update the inner state
-    blake2b_yp_update(&hctx->inner, data, datalen);
-}
-
-void hmac_blake2b_yp_final(hmac_yp_ctx *hctx, uint8_t *digest) {
-    uint8_t ihash[32];
-    blake2b_yp_final(&hctx->inner, ihash);
-    blake2b_yp_update(&hctx->outer, ihash, 32);
-    blake2b_yp_final(&hctx->outer, digest);
-    memset(ihash, 0, 32);
-}
-
-// // keylen = number of bytes; inlen = number of bytes
-void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen) {
-    hmac_yp_ctx hctx;
-    hmac_blake2b_yp_init(&hctx, key, keylen);
-    hmac_blake2b_yp_update(&hctx, in, inlen);
-    hmac_blake2b_yp_final(&hctx, out);
-}
-
-void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
-    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
-{
-    hmac_yp_ctx PShctx, hctx;
-    size_t i;
-    uint32_t ivec;
-    uint8_t U[32];
-    uint8_t T[32];
-    uint64_t j;
-    int k;
-    size_t clen;
-
-    /* Compute HMAC state after processing P and S. */
-    hmac_blake2b_yp_init(&PShctx, passwd, passwdlen);
-    hmac_blake2b_yp_update(&PShctx, salt, saltlen);
-
-    /* Iterate through the blocks. */
-    for (i = 0; i * 32 < dkLen; i++) {
-        /* Generate INT(i + 1). */
-        ivec = bswap_32( i+1 );
-
-        /* Compute U_1 = PRF(P, S || INT(i)). */
-        memcpy(&hctx, &PShctx, sizeof(hmac_yp_ctx));
-        hmac_blake2b_yp_update(&hctx, &ivec, 4);
-        hmac_blake2b_yp_final(&hctx, U);
-
-        /* T_i = U_1 ... */
-        memcpy(T, U, 32);
-
-        for (j = 2; j <= c; j++) {
-            /* Compute U_j. */
-            hmac_blake2b_yp_init(&hctx, passwd, passwdlen);
-            hmac_blake2b_yp_update(&hctx, U, 32);
-            hmac_blake2b_yp_final(&hctx, U);
-
-            /* ... xor U_j ... */
-            for (k = 0; k < 32; k++) {
-                T[k] ^= U[k];
-            }
-        }
-
-        /* Copy as many bytes as necessary into buf. */
-        clen = dkLen - i * 32;
-        if (clen > 32) {
-            clen = 32;
-        }
-
-        memcpy(&buf[i * 32], T, clen);
-    }
-
-    /* Clean PShctx, since we never called _Final on it. */
-    memset(&PShctx, 0, sizeof(hmac_yp_ctx));
-}
--- a/algo/yespower/crypto/blake2b-yp.h
+++ b/algo/yespower/crypto/blake2b-yp.h
@@ -1,42 +0,0 @@
-#pragma once
-#ifndef __BLAKE2B_H__
-#define __BLAKE2B_H__
-
-#include <stddef.h>
-#include <stdint.h>
-
-#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
-#define NATIVE_LITTLE_ENDIAN
-#endif
-
-// state context
-typedef struct {
-    uint8_t b[128]; // input buffer
-    uint64_t h[8];  // chained state
-    uint64_t t[2];  // total number of bytes
-    size_t c;       // pointer for b[]
-    size_t outlen;  // digest size
-} blake2b_yp_ctx;
-
-typedef struct {
-    blake2b_yp_ctx inner;
-    blake2b_yp_ctx outer;
-} hmac_yp_ctx;
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-int blake2b_yp_init(blake2b_yp_ctx *ctx, size_t outlen, const void *key, size_t keylen);
-void blake2b_yp_update(blake2b_yp_ctx *ctx, const void *in, size_t inlen);
-void blake2b_yp_final(blake2b_yp_ctx *ctx, void *out);
-void blake2b_yp_hash(void *out, const void *in, size_t inlen);
-void hmac_blake2b_yp_hash(void *out, const void *key, size_t keylen, const void *in, size_t inlen);
-void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
-    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
--- a/algo/yespower/crypto/hmac-blake2b.c
+++ b/algo/yespower/crypto/hmac-blake2b.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2009 Colin Percival, 2014 savale
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *  notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *  notice, this list of conditions and the following disclaimer in the
+ *  documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include "simd-utils.h"
+#include "hmac-blake2b.h"
+
+// keylen = number of bytes
+void hmac_blake2b_init( hmac_blake2b_ctx *hctx, const void *_key,
+                        size_t keylen )
+{
+    const uint8_t *key = _key;
+    uint8_t keyhash[32];
+    uint8_t pad[64];
+    uint64_t i;
+
+    if (keylen > 64)
+    {
+       sph_blake2b_ctx ctx;
+       sph_blake2b_init( &ctx, 32, NULL, 0 );
+       sph_blake2b_update( &ctx, key, keylen );
+       sph_blake2b_final( &ctx, keyhash );
+       key = keyhash;
+       keylen = 32;
+    }
+
+    sph_blake2b_init( &hctx->inner, 32, NULL, 0 );
+    memset( pad, 0x36, 64 );
+    for ( i = 0; i < keylen; ++i )
+        pad[i] ^= key[i];
+
+    sph_blake2b_update( &hctx->inner, pad, 64 );
+    sph_blake2b_init( &hctx->outer, 32, NULL, 0 );
+    memset( pad, 0x5c, 64 );
+    for ( i = 0; i < keylen; ++i )
+        pad[i] ^= key[i];
+
+    sph_blake2b_update( &hctx->outer, pad, 64 );
+    memset( keyhash, 0, 32 );
+}
+
+// datalen = number of bits
+void hmac_blake2b_update( hmac_blake2b_ctx *hctx, const void *data,
+                          size_t datalen )
+{
+    // update the inner state
+    sph_blake2b_update( &hctx->inner, data, datalen );
+}
+
+void hmac_blake2b_final( hmac_blake2b_ctx *hctx, uint8_t *digest )
+{
+    uint8_t ihash[32];
+    sph_blake2b_final( &hctx->inner, ihash );
+    sph_blake2b_update( &hctx->outer, ihash, 32 );
+    sph_blake2b_final( &hctx->outer, digest );
+    memset( ihash, 0, 32 );
+}
+
+// // keylen = number of bytes; inlen = number of bytes
+void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
+                        const void *in, size_t inlen )
+{
+    hmac_blake2b_ctx hctx;
+    hmac_blake2b_init( &hctx, key, keylen );
+    hmac_blake2b_update( &hctx, in, inlen );
+    hmac_blake2b_final( &hctx, out );
+}
+
+void pbkdf2_blake2b( const uint8_t *passwd, size_t passwdlen,
+                     const uint8_t *salt, size_t saltlen, uint64_t c,
+                     uint8_t *buf, size_t dkLen )
+{
+    hmac_blake2b_ctx PShctx, hctx;
+    size_t i;
+    uint32_t ivec;
+    uint8_t U[32];
+    uint8_t T[32];
+    uint64_t j;
+    int k;
+    size_t clen;
+
+    /* Compute HMAC state after processing P and S. */
+    hmac_blake2b_init( &PShctx, passwd, passwdlen );
+    hmac_blake2b_update( &PShctx, salt, saltlen );
+
+    /* Iterate through the blocks. */
+    for ( i = 0; i * 32 < dkLen; i++ )
+    {
+        /* Generate INT(i + 1). */
+        ivec = bswap_32( i+1 );
+
+        /* Compute U_1 = PRF(P, S || INT(i)). */
+        memcpy( &hctx, &PShctx, sizeof(hmac_blake2b_ctx) );
+        hmac_blake2b_update( &hctx, &ivec, 4 );
+        hmac_blake2b_final( &hctx, U );
+
+        /* T_i = U_1 ... */
+        memcpy( T, U, 32 );
+
+        for ( j = 2; j <= c; j++ )
+        {
+            /* Compute U_j. */
+            hmac_blake2b_init( &hctx, passwd, passwdlen );
+            hmac_blake2b_update( &hctx, U, 32 );
+            hmac_blake2b_final( &hctx, U );
+
+            /* ... xor U_j ... */
+            for ( k = 0; k < 32; k++ )
+                T[k] ^= U[k];
+        }
+
+        /* Copy as many bytes as necessary into buf. */
+        clen = dkLen - i * 32;
+        if (clen > 32)
+            clen = 32;
+
+        memcpy( &buf[i * 32], T, clen );
+    }
+
+    /* Clean PShctx, since we never called _Final on it. */
+    memset( &PShctx, 0, sizeof(hmac_blake2b_ctx) );
+}
--- a/algo/yespower/crypto/hmac-blake2b.h
+++ b/algo/yespower/crypto/hmac-blake2b.h
@@ -0,0 +1,34 @@
+#pragma once
+#ifndef __HMAC_BLAKE2B_H__
+#define __HMAC_BLAKE2B_H__
+
+#include <stddef.h>
+#include <stdint.h>
+#include "algo/blake/sph_blake2b.h"
+
+#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
+#define NATIVE_LITTLE_ENDIAN
+#endif
+
+typedef struct
+{
+    sph_blake2b_ctx inner;
+    sph_blake2b_ctx outer;
+} hmac_blake2b_ctx;
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void hmac_blake2b_hash( void *out, const void *key, size_t keylen,
+                        const void *in, size_t inlen );
+
+void pbkdf2_blake2b( const uint8_t * passwd, size_t passwdlen,
+                     const uint8_t * salt, size_t saltlen, uint64_t c,
+                     uint8_t * buf, size_t dkLen );
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/algo/yespower/crypto/sph_types.h
+++ b/algo/yespower/crypto/sph_types.h
--- a/algo/yespower/yespower-blake2b.c
+++ b/algo/yespower/yespower-blake2b.c
@@ -95,7 +95,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
-#include "crypto/blake2b-yp.h"
+#include "crypto/hmac-blake2b.h"
 #include "yespower.h"

 #ifdef __unix__
@@ -1136,6 +1136,7 @@ int yespower_b2b(yespower_local_t *local,
    salsa20_blk_t *V, *XY;
    pwxform_ctx_t ctx;
    uint8_t init_hash[32];
+    sph_blake2b_ctx blake2b_ctx;

    /* Sanity-check parameters */
    if ((N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
@@ -1167,7 +1168,9 @@ int yespower_b2b(yespower_local_t *local,
    ctx.S0 = S;
    ctx.S1 = S + Swidth_to_Sbytes1(Swidth);

-    blake2b_yp_hash(init_hash, src, srclen);
+    sph_blake2b_init( &blake2b_ctx, 32, NULL, 0 );
+    sph_blake2b_update( &blake2b_ctx, src, srclen );
+    sph_blake2b_final( &blake2b_ctx, init_hash );

    ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth);
    ctx.w = 0;
@@ -1181,7 +1184,7 @@ int yespower_b2b(yespower_local_t *local,

    if ( work_restart[thrid].restart ) return false;
    
-    pbkdf2_blake2b_yp(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);
+    pbkdf2_blake2b(init_hash, sizeof(init_hash), src, srclen, 1, B, 128);

    if ( work_restart[thrid].restart ) return false;

@@ -1190,7 +1193,7 @@ int yespower_b2b(yespower_local_t *local,

    if ( work_restart[thrid].restart ) return false;

-    hmac_blake2b_yp_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));
+    hmac_blake2b_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash));

    /* Success! */
    return 1;
--- a/algo/yespower/yespower-gate.c
+++ b/algo/yespower/yespower-gate.c
@@ -161,7 +161,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )

 // Legacy Yescrypt (yespower v0.5)

-bool register_yescrypt_05_algo( algo_gate_t* gate )
+bool register_yescrypt_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
@@ -194,7 +194,7 @@ bool register_yescrypt_05_algo( algo_gate_t* gate )
 }


-bool register_yescryptr8_05_algo( algo_gate_t* gate )
+bool register_yescryptr8_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
@@ -207,7 +207,7 @@ bool register_yescryptr8_05_algo( algo_gate_t* gate )
   return true;
 }

-bool register_yescryptr16_05_algo( algo_gate_t* gate )
+bool register_yescryptr16_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
@@ -220,7 +220,7 @@ bool register_yescryptr16_05_algo( algo_gate_t* gate )
   return true;
 }

-bool register_yescryptr32_05_algo( algo_gate_t* gate )
+bool register_yescryptr32_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
@@ -249,7 +249,7 @@ bool register_power2b_algo( algo_gate_t* gate )
  applog( LOG_NOTICE,"Key= \"%s\"", yespower_params.pers );
  applog( LOG_NOTICE,"Key length= %d\n", yespower_params.perslen );

-  gate->optimizations = SSE2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
  gate->scanhash      = (void*)&scanhash_yespower_b2b;
  gate->hash          = (void*)&yespower_b2b_hash;
  opt_target_factor = 65536.0;
--- a/48
+++ b/48
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.0.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.20.2.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.20.0'
-PACKAGE_STRING='cpuminer-opt 3.20.0'
+PACKAGE_VERSION='3.20.2'
+PACKAGE_STRING='cpuminer-opt 3.20.2'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.20.0 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.20.2 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.20.0:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.20.2:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.20.0
+cpuminer-opt configure 3.20.2
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.20.0, which was
+It was created by cpuminer-opt $as_me 3.20.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.20.0'
+ VERSION='3.20.2'


 cat >>confdefs.h <<_ACEOF
@@ -5820,6 +5820,34 @@ $as_echo "#define USE_AVX2 1" >>confdefs.h

      { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
 $as_echo "yes" >&6; }
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX512 code" >&5
+$as_echo_n "checking whether we can compile AVX512 code... " >&6; }
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define USE_AVX512 1" >>confdefs.h
+
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+        { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX512 instruction set." >&5
+$as_echo "$as_me: WARNING: The assembler does not support the AVX512 instruction set." >&2;}
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext

 else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
@@ -6690,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.20.0, which was
+This file was extended by cpuminer-opt $as_me 3.20.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6784,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.20.0
+cpuminer-opt config.status 3.20.2
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.20.0])
+AC_INIT([cpuminer-opt], [3.20.2])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
@@ -93,6 +93,14 @@ then
    AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
      AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
      AC_MSG_RESULT(yes)
+      AC_MSG_CHECKING(whether we can compile AVX512 code)
+      AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %zmm0, %zmm1, %zmm2{%k1}");])],
+        AC_DEFINE(USE_AVX512, 1, [Define to 1 if AVX512 assembly is available.])
+        AC_MSG_RESULT(yes)
+      ,
+        AC_MSG_RESULT(no)
+        AC_MSG_WARN([The assembler does not support the AVX512 instruction set.])
+      )
    ,
      AC_MSG_RESULT(no)
      AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1300,6 +1300,7 @@ static int share_result( int result, struct work *work,
           my_stats.share_count, acol, ares, scol, sres, rcol, rres, bcol,
           bres, CL_N, share_time, latency );

+/*   
   if ( unlikely( opt_debug || !result || solved ) )
   {
      if ( have_stratum )
@@ -1309,14 +1310,27 @@ static int share_result( int result, struct work *work,
         applog2( LOG_INFO, "Diff %.5g, Block %d",
               my_stats.share_diff, work ? work->height : last_block_height );
   }
+*/

   if ( unlikely( !( opt_quiet || result || stale ) ) )
   {
-      uint32_t str[8];
-      uint32_t *targ;
+//      uint32_t str[8];
+//      uint32_t *targ;

-      if ( reason ) applog( LOG_MINR, "Reject reason: %s", reason );
-         
+      if ( reason ) applog2( LOG_MINR, "Reject reason: %s", reason );
+      {
+         // The exact hash is not avaiable here, it's just an imprecise
+         // approximation calculated from the share difficulty. It's useless
+         // for anything other than low diff rejects. Until and unless a
+         // solution is implemented to make the hash and targets avaiable
+         // don't bother displaying them. In the meantime display the diff for
+         // low diff rejects.
+
+         if ( strstr( reason, "difficulty" ) )
+            applog2( LOG_MINR, "Share diff: %.5g, Target: %.5g",
+                               my_stats.share_diff, my_stats.target_diff );
+
+/*
      diff_to_hash( str, my_stats.share_diff );
      applog2( LOG_INFO, "Hash:   %08x%08x%08x%08x%08x%08x", str[7], str[6],
               str[5], str[4], str[3],str[2], str[1], str[0] );
@@ -1330,6 +1344,8 @@ static int share_result( int result, struct work *work,
      }
      applog2( LOG_INFO, "Target: %08x%08x%08x%08x%08x%08x", targ[7], targ[6],
               targ[5], targ[4], targ[3], targ[2], targ[1], targ[0] );
+*/
+      }
   }
   return 1;
 }
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -273,9 +273,9 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #endif

 // Mask making
-
 // Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
 // Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
+// Effectively a sign test.

 #define mm_movmask_64( v ) \
   _mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) )
@@ -306,34 +306,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 //
 // Bit rotations

-// AVX512VL has implemented bit rotation for 128 bit vectors with
-// 64 and 32 bit elements.
-
 // x2 rotates elements in 2 individual vectors in a double buffered
 // optimization for SSE2, does nothing for AVX512 but is there for
 // transparency.

-// compiler doesn't like when a variable is used for the last arg of
-// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
-// specification but works with a variable. Therefore use rol_var where
-// necessary.
-// sm3-hash-4way.c has one instance where mm128_rol_var_32 is required.
-
-#define mm128_ror_var_64( v, c ) \
-   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
-
-#define mm128_rol_var_64( v, c ) \
-   _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
-
-#define mm128_ror_var_32( v, c ) \
-   _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
-
-#define mm128_rol_var_32( v, c ) \
-   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
-
-
 #if defined(__AVX512VL__)
-//#if defined(__AVX512F__) && defined(__AVX512VL__)

 #define mm128_ror_64    _mm_ror_epi64
 #define mm128_rol_64    _mm_rol_epi64
@@ -358,10 +335,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

 #else  // SSE2

-#define mm128_ror_64   mm128_ror_var_64
-#define mm128_rol_64   mm128_rol_var_64
-#define mm128_ror_32   mm128_ror_var_32
-#define mm128_rol_32   mm128_rol_var_32
+#define mm128_ror_64( v, c ) \
+   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
+
+#define mm128_rol_64( v, c ) \
+   _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
+
+#define mm128_ror_32( v, c ) \
+   _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
+
+#define mm128_rol_32( v, c ) \
+   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )

 #define mm128_rorx2_64( v1, v0, c ) \
 { \
@@ -411,6 +395,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_rol_16( v, c ) \
   _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )

+// Deprecated.
+#define mm128_rol_var_32( v, c ) \
+   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
+
+//
 // Limited 2 input shuffle, combines shuffle with blend. The destination low
 // half is always taken from src a, and the high half from src b.
 #define mm128_shuffle2_64( a, b, c ) \
@@ -421,7 +410,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
   _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
                                     _mm_castsi128_ps( b ), c ) ); 

-
 //
 // Rotate vector elements accross all lanes

@@ -432,21 +420,61 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_shuflr_32( v )   _mm_shuffle_epi32( v, 0x39 )
 #define mm128_shufll_32( v )   _mm_shuffle_epi32( v, 0x93 )

-
-// Swap 32 bit elements in 64 bit lanes
-#define mm128_swap64_32( v )  _mm_shuffle_epi32( v, 0xb1 )
-#define mm128_shuflr64_32 mm128_swap64_32
-#define mm128_shufll64_32 mm128_swap64_32
-
 #if defined(__SSSE3__)

 // Rotate right by c bytes, no SSE2 equivalent.
 static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 { return _mm_alignr_epi8( v, v, c ); }

+#endif
+
+// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
+// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
+// (unlikely but faster), or when SSSE3 is not available (slower).
+
+#define mm128_swap64_32( v )  _mm_shuffle_epi32( v, 0xb1 )
+#define mm128_shuflr64_32 mm128_swap64_32
+#define mm128_shufll64_32 mm128_swap64_32
+
+#if defined(__SSSE3__) && !defined(__AVX512VL__)
+  #define mm128_shuflr64_24( v ) \
+    _mm_shuffle_epi8( v, _mm_set_epi64x( \
+                                    0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
+#else
+  #define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
+#endif
+
+#if defined(__SSSE3__) && !defined(__AVX512VL__)
+  #define mm128_shuflr64_16( v ) \
+    _mm_shuffle_epi8( v, _mm_set_epi64x( \
+                                    0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
+#else
+  #define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
+#endif
+
+#if defined(__SSSE3__) && !defined(__AVX512VL__)
+  #define mm128_swap32_16( v ) \
+    _mm_shuffle_epi8( v, _mm_set_epi64x( \
+                                    0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
+#else
+  #define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
+#endif
+#define mm128_shuflr32_16 mm128_swap32_16
+#define mm128_shufll32_16 mm128_swap32_16
+
+#if defined(__SSSE3__) && !defined(__AVX512VL__)
+  #define mm128_shuflr32_8( v ) \
+    _mm_shuffle_epi8( v, _mm_set_epi64x( \
+                                    0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
+#else
+  #define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
+#endif
+
 //
 // Endian byte swap.

+#if defined(__SSSE3__)
+
 #define mm128_bswap_64( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
                                       0x0001020304050607 ) )
@@ -537,8 +565,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
 //
 // Rotate in place concatenated 128 bit vectors as one 256 bit vector.

-// Swap 128 bit vectorse.
-
+// Swap 128 bit vectors.
+// This should be avoided, it's more efficient to switch references.
 #define mm128_swap256_128( v1, v2 ) \
   v1 = _mm_xor_si128( v1, v2 ); \
   v2 = _mm_xor_si128( v1, v2 ); \
@@ -546,15 +574,14 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )


 // Two input shuffle-rotate.
-// Concatenate v1 & v2 and rotate as one 256 bit vector.
-// Continue to use vror/vrol for now to avoid confusion with
-// shufl2r/shufl2l function macros available with AVX512.
+// Concatenate v1 & v2 and bit rotate as one 256 bit vector.

 #if defined(__SSSE3__)

-// Function macro with two inputs and one output, inputs are preserved.
-// Two input functions are not available without SSSE3. Use procedure
-// macros below instead.
+// Function macros with two inputs and one output, inputs are preserved.
+// Returns the high 128 bits, ie updated v1.
+// These functions are preferred but only available with SSSE3. Use procedure
+// macros below for SSE2 compatibility.

 #define mm128_shufl2r_64( v1, v2 )     _mm_alignr_epi8( v2, v1, 8 )
 #define mm128_shufl2l_64( v1, v2 )     _mm_alignr_epi8( v1, v2, 8 )
@@ -568,12 +595,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
 #define mm128_shufl2r_8( v1, v2 )      _mm_alignr_epi8( v2, v1, 8 )
 #define mm128_shufl2l_8( v1, v2 )      _mm_alignr_epi8( v1, v2, 8 )

-// Procedure macros with 2 inputs and 2 outputs, inputs args are overwritten.
-
-// These macros retain the vrol/vror name for now to avoid
-// confusion with the shufl2r/shuffle2l function macros above.
-// These may be renamed to something like shufl2r2 for 2 nputs and
-// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.
+// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
+// Deprecated for SSSE3 and above, SSSE3 versions exist for only for
+// compatibility with with existing code. 

 #define mm128_vror256_64( v1, v2 ) \
 do { \
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -13,6 +13,18 @@
 // AVX512 implementations. They will be selected automatically but their use
 // is limited because 256 bit vectors are less likely to be used when 512
 // is available.
+//
+// AVX2 version of _mm256_shuffle_epi8 is limited to 128 bit lanes but AVX512
+// version is not. Some usage has the index vector encoded as if full vector
+// shuffles are supported. This has no side effects and would have the same
+// results using either version.
+// If needed and AVX512 is available, 256 bit full vector shuffles can be
+// implemented using the AVX512 zero-mask feature with a NULL mask.
+// Using intrinsics it's simple:
+//   _mm256_maskz_shuffle_epi8( k0, v, c )
+// With asm it's a bit more complicated with the addition of the mask register
+// and zero tag:
+//   vpshufb ymm0{k0}{z}, ymm1, ymm2 

 #if defined(__AVX__)

@@ -234,9 +246,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 #endif

 // Mask making
-
 // Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
 // Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
+// Effectively a sign test.

 #define mm256_movmask_64( v ) \
   _mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
@@ -273,42 +285,11 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //
 //           Bit rotations.
 //
-// The only bit shift for more than 64 bits is with __int128 which is slow.
-//
-// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
-//
 // x2 rotates elements in 2 individual vectors in a double buffered
-// optimization for SSE2, does nothing for AVX512 but is there for
+// optimization for AVX2, does nothing for AVX512 but is here for
 // transparency.

-
-// compiler doesn't like when a variable is used for the last arg of
-// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where
-// necessary. 
-
-#define mm256_ror_var_64( v, c ) \
-   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
-                    _mm256_slli_epi64( v, 64-(c) ) )
-
-#define mm256_rol_var_64( v, c ) \
-   _mm256_or_si256( _mm256_slli_epi64( v, c ), \
-                    _mm256_srli_epi64( v, 64-(c) ) )
-
-#define mm256_ror_var_32( v, c ) \
-   _mm256_or_si256( _mm256_srli_epi32( v, c ), \
-                    _mm256_slli_epi32( v, 32-(c) ) )
-
-#define mm256_rol_var_32( v, c ) \
-   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
-                    _mm256_srli_epi32( v, 32-(c) ) )
-
-
-// The spec says both F & VL are required, but just in case AMD
-// decides to implement ROL/R without AVX512F.
 #if defined(__AVX512VL__)
-//#if defined(__AVX512F__) && defined(__AVX512VL__)
-
-// AVX512, control must be 8 bit immediate.

 #define mm256_ror_64    _mm256_ror_epi64
 #define mm256_rol_64    _mm256_rol_epi64
@@ -333,10 +314,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )

 #else   // AVX2

-#define mm256_ror_64    mm256_ror_var_64 
-#define mm256_rol_64    mm256_rol_var_64
-#define mm256_ror_32    mm256_ror_var_32
-#define mm256_rol_32    mm256_rol_var_32
+// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.
+
+#define mm256_ror_64( v, c ) \
+   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
+                    _mm256_slli_epi64( v, 64-(c) ) )
+
+#define mm256_rol_64( v, c ) \
+   _mm256_or_si256( _mm256_slli_epi64( v, c ), \
+                    _mm256_srli_epi64( v, 64-(c) ) )
+
+#define mm256_ror_32( v, c ) \
+   _mm256_or_si256( _mm256_srli_epi32( v, c ), \
+                    _mm256_slli_epi32( v, 32-(c) ) )
+
+#define mm256_rol_32( v, c ) \
+   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
+                    _mm256_srli_epi32( v, 32-(c) ) )

 #define mm256_rorx2_64( v1, v0, c ) \
 { \
@@ -388,6 +382,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
   _mm256_or_si256( _mm256_slli_epi16( v, c ), \
                    _mm256_srli_epi16( v, 16-(c) ) )

+// Deprecated.
+#define mm256_rol_var_32( v, c ) \
+   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
+                    _mm256_srli_epi32( v, 32-(c) ) )

 //
 // Rotate elements accross all lanes.
@@ -399,7 +397,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )

 // Rotate 256 bit vector by one 64 bit element
 #define mm256_shuflr_64( v )    _mm256_permute4x64_epi64( v, 0x39 )
-
 #define mm256_shufll_64( v )    _mm256_permute4x64_epi64( v, 0x93 )

 // Rotate 256 bit vector by one 32 bit element.
@@ -413,7 +410,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
                     m256_const_64( 0x0000000600000005,  0x0000000400000003, \
                                    0x0000000200000001,  0x0000000000000007 ) )

-       
 //
 // Rotate elements within each 128 bit lane of 256 bit vector.

@@ -426,7 +422,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
   _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
                                           _mm256_castsi256_ps( b ), c ) ); 

-
 #define mm256_swap128_64( v )  _mm256_shuffle_epi32( v, 0x4e )
 #define mm256_shuflr128_64 mm256_swap128_64
 #define mm256_shufll128_64 mm256_swap128_64
@@ -437,11 +432,52 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 { return _mm256_alignr_epi8( v, v, c ); }

-// Swap 32 bit elements in each 64 bit lane.
+// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
+// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
+// AVX512 is available.
+
 #define mm256_swap64_32( v )   _mm256_shuffle_epi32( v, 0xb1 )
 #define mm256_shuflr64_32 mm256_swap64_32
 #define mm256_shufll64_32 mm256_swap64_32

+#if defined(__AVX512VL__)
+  #define mm256_shuflr64_24( v )  _mm256_ror_epi64( v, 24 )
+#else
+  #define mm256_shuflr64_24( v ) \
+    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
+                                    0x0a09080f0e0d0c0b, 0x0201000706050403, \
+                                    0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
+#endif
+
+#if defined(__AVX512VL__)
+  #define mm256_shuflr64_16( v )  _mm256_ror_epi64( v, 16 )
+#else
+  #define mm256_shuflr64_16( v ) \
+    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
+                                    0x09080f0e0d0c0b0a, 0x0100070605040302, \
+                                    0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
+#endif
+
+#if defined(__AVX512VL__)
+  #define mm256_swap32_16( v )  _mm256_ror_epi32( v, 16 )
+#else
+  #define mm256_swap32_16( v ) \
+    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
+                                    0x0d0c0f0e09080b0a, 0x0504070601000302, \
+                                    0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
+#endif
+#define mm256_shuflr32_16 mm256_swap32_16
+#define mm256_shufll32_16 mm256_swap32_16
+
+#if defined(__AVX512VL__)
+  #define mm256_shuflr32_8( v )  _mm256_ror_epi32( v, 8 )
+#else
+  #define mm256_shuflr32_8( v ) \
+    _mm256_shuffle_epi8( v, _mm256_set_epi64x( \
+                                    0x0c0f0e0d080b0a09, 0x0407060500030201, \
+                                    0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
+#endif
+
 // NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
 // lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
 // AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
@@ -496,18 +532,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
  casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
 } while(0)

-//
-// Rotate two concatenated 256 bit vectors as one 512 bit vector by specified
-// number of elements. Rotate is done in place, source arguments are
-// overwritten.
-// Some of these can use permute but appears to be slower. Maybe a Ryzen
-// issue
-
-//  _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
-//  makes these macros unnecessary.
-
-// continue using vror/vrol notation for now to avoid confusion with
-// shufl2r/shufl2l macro functions available with AVX512.
+// swap 256 bit vectors in place.
+// This should be avoided, it's more efficient to switch references.
 #define mm256_swap512_256( v1, v2 ) \
   v1 = _mm256_xor_si256( v1, v2 ); \
   v2 = _mm256_xor_si256( v1, v2 ); \
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -316,58 +316,18 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 // Bit rotations.

 // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
-// elements and can be called directly. But they only accept immediate 8
-// for control arg. 
-// The workaround is a fraud, just a fluke of the compiler's optimizer.
-// It fails without -O3. The compiler seems to unroll shift loops, eliminating
-// the variable control, better than rotate loops. 
+// elements and can be called directly. 
 //
 // _mm512_rol_epi64,  _mm512_ror_epi64,  _mm512_rol_epi32,  _mm512_ror_epi32
 // _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
 //

-// For convenience and consistency with AVX2
+// For convenience and consistency with AVX2 macros.
 #define mm512_ror_64 _mm512_ror_epi64
 #define mm512_rol_64 _mm512_rol_epi64
 #define mm512_ror_32 _mm512_ror_epi32
 #define mm512_rol_32 _mm512_rol_epi32

-static inline __m512i mm512_ror_var_64( const __m512i v, const int c )
-{
-   return _mm512_or_si512( _mm512_srli_epi64( v, c ),
-                           _mm512_slli_epi64( v, 64-c ) );
-}
-
-static inline __m512i mm512_rol_var_64( const __m512i v, const int c )
-{
-   return _mm512_or_si512( _mm512_slli_epi64( v, c ),
-                           _mm512_srli_epi64( v, 64-c ) );
-}
-
-static inline __m512i mm512_ror_var_32( const __m512i v, const int c )
-{
-   return _mm512_or_si512( _mm512_srli_epi32( v, c ),
-                           _mm512_slli_epi32( v, 32-c ) );
-}
-
-static inline __m512i mm512_rol_var_32( const __m512i v, const int c )
-{
-   return _mm512_or_si512( _mm512_slli_epi32( v, c ),
-                           _mm512_srli_epi32( v, 32-c ) );
-}
-
-static inline __m512i mm512_ror_16( __m512i const v, const int c )
-{
-   return _mm512_or_si512( _mm512_srli_epi16( v, c ),
-                           _mm512_slli_epi16( v, 16-c ) );
-}
-
-static inline __m512i mm512_rol_16( const __m512i v, const int c )
-{
-   return _mm512_or_si512( _mm512_slli_epi16( v, c ),
-                           _mm512_srli_epi16( v, 16-c ) );
-}
-
 // Rotations using a vector control index are very slow due to overhead
 // to generate the index vector. Repeated rotations using the same index
 // are better handled by the calling function where the index only needs
@@ -599,22 +559,34 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
 static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 {  return _mm512_alignr_epi8( v, v, c ); }

-// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction
-// but only with AVX512. Shuffle is just as fast and availble with AVX2
-// & SSE2.
+// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
+// can be done with ror & rol. Defined only for convenience and consistency
+// with AVX2 & SSE2 macros.
+
 #define mm512_swap64_32( v )    _mm512_shuffle_epi32( v, 0xb1 )
 #define mm512_shuflr64_32 mm512_swap64_32
 #define mm512_shufll64_32 mm512_swap64_32

-// Need good way to distinguish 1 input shuffles, 2 input shuffle functions,
-// and 2 input 2 output shuffle macros.
-//
-// shuflr is 1 input
-// shufl2r is 2 input ...
-// Drop macros? They can easilly be rebuilt using shufl2 functions
+#define mm512_shuflr64_24( v )  _mm512_ror_epi64( v, 24 )
+#define mm512_shufll64_24( v )  _mm512_rol_epi64( v, 24 )
+
+#define mm512_shuflr64_16( v )  _mm512_ror_epi64( v, 16 )
+#define mm512_shufll64_16( v )  _mm512_rol_epi64( v, 16 )
+
+#define mm512_shuflr64_8(  v )  _mm512_ror_epi64( v,  8 )
+#define mm512_shufll64_8(  v )  _mm512_rol_epi64( v,  8 )
+
+#define mm512_swap32_16(   v )  _mm512_ror_epi32( v, 16 )
+#define mm512_shuflr32_16 mm512_swap32_16
+#define mm512_shufll32_16 mm512_swap32_16
+
+#define mm512_shuflr32_8(  v )  _mm512_ror_epi32( v,  8 )
+#define mm512_shufll32_8(  v )  _mm512_rol_epi32( v,  8 )
+

 // 2 input, 1 output
-// Rotate concatenated { v1, v2 ) right or left and return v1. 
+// Concatenate { v1, v2 ) then rotate right or left and return the high
+// 512 bits, ie rotated v1. 
 #define mm512_shufl2r_256( v1, v2 )    _mm512_alignr_epi64( v2, v1, 4 )
 #define mm512_shufl2l_256( v1, v2 )    _mm512_alignr_epi64( v1, v2, 4 )
Author	SHA1	Message	Date
Jay D Dee	58030e2788	v3.20.2	2022-08-01 20:21:05 -04:00
Jay D Dee	1321ac474c	v3.20.1	2022-07-26 18:36:40 -04:00