v3.10.6

2025-09-17 23:44:27 +00:00 · 2019-12-25 01:26:26 -05:00
parent c65b0ff7a6
commit 241bc26767
35 changed files with 3036 additions and 643 deletions
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -10,7 +10,140 @@
 #define LBRY_MIDSTATE    64
 #define LBRY_TAIL (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE)

-#if defined(LBRY_8WAY)
+#if defined(LBRY_16WAY)
+
+static __thread sha256_16way_context sha256_16w_mid;
+// 16-lane LBRY hash: sha256 twice over the header, sha512, ripemd160 of each sha512 half, then sha256 twice over both ripemd digests.
+void lbry_16way_hash( void* output, const void* input )
+{
+   uint32_t _ALIGN(128) vhashA[16<<4];   // interleaved scratch buffers, reused between stages
+   uint32_t _ALIGN(64) vhashB[16<<4];
+   uint32_t _ALIGN(64) vhashC[16<<4];
+   uint32_t _ALIGN(64) h0[32];           // h0..h15: one linear buffer per lane for re-interleaving
+   uint32_t _ALIGN(64) h1[32];
+   uint32_t _ALIGN(64) h2[32];
+   uint32_t _ALIGN(64) h3[32];
+   uint32_t _ALIGN(64) h4[32];
+   uint32_t _ALIGN(64) h5[32];
+   uint32_t _ALIGN(64) h6[32];
+   uint32_t _ALIGN(64) h7[32];
+   uint32_t _ALIGN(64) h8[32];
+   uint32_t _ALIGN(64) h9[32];
+   uint32_t _ALIGN(64) h10[32];
+   uint32_t _ALIGN(64) h11[32];
+   uint32_t _ALIGN(64) h12[32];
+   uint32_t _ALIGN(64) h13[32];
+   uint32_t _ALIGN(64) h14[32];
+   uint32_t _ALIGN(64) h15[32];
+   sha256_16way_context     ctx_sha256 __attribute__ ((aligned (64)));
+   sha512_8way_context     ctx_sha512;
+   ripemd160_16way_context  ctx_ripemd;
+   // sha256 #1: resume from the cached midstate and absorb the LBRY_TAIL bytes that follow it.
+   memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
+   sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
+   sha256_16way_close( &ctx_sha256, vhashA );
+   // sha256 #2: hash the 32-byte digest of the previous pass.
+   sha256_16way_init( &ctx_sha256 );
+   sha256_16way_update( &ctx_sha256, vhashA, 32 );
+   sha256_16way_close( &ctx_sha256, vhashA );
+
+   // reinterleave to do sha512 8-way 64 bit twice.
+   dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
+                  h8, h9, h10, h11, h12, h13, h14, h15, vhashA, 256 );
+   intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
+   intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );
+   // sha512 of the 32-byte digests, lanes 0-7.
+   sha512_8way_init( &ctx_sha512 );
+   sha512_8way_update( &ctx_sha512, vhashA, 32 );
+   sha512_8way_close( &ctx_sha512, vhashA );
+   // sha512 of the 32-byte digests, lanes 8-15.
+   sha512_8way_init( &ctx_sha512 );
+   sha512_8way_update( &ctx_sha512, vhashB, 32 );
+   sha512_8way_close( &ctx_sha512, vhashB );
+
+   // back to 16-way 32 bit
+   dintrlv_8x64( h0, h1, h2, h3,h4, h5, h6, h7, vhashA, 512 );
+   dintrlv_8x64( h8, h9, h10, h11, h12, h13, h14, h15, vhashB, 512 );
+   intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
+                         h8, h9, h10, h11, h12, h13, h14, h15, 512 );
+   // ripemd160 of the first 32 bytes of each 64-byte sha512 digest.
+   ripemd160_16way_init( &ctx_ripemd );
+   ripemd160_16way( &ctx_ripemd, vhashA, 32 );   // update routine as declared in ripemd-hash-4way.h
+   ripemd160_16way_close( &ctx_ripemd, vhashB );
+   // ripemd160 of the second 32 bytes; 8<<4 words skips 8 rows of 16 interleaved lanes.
+   ripemd160_16way_init( &ctx_ripemd );
+   ripemd160_16way( &ctx_ripemd, vhashA+(8<<4), 32 );
+   ripemd160_16way_close( &ctx_ripemd, vhashC );
+   // sha256 over the two 20-byte ripemd160 digests, fed back to back.
+   sha256_16way_init( &ctx_sha256 );
+   sha256_16way_update( &ctx_sha256, vhashB, 20 );
+   sha256_16way_update( &ctx_sha256, vhashC, 20 );
+   sha256_16way_close( &ctx_sha256, vhashA );
+   // final sha256 of that 32-byte digest, written still interleaved to output.
+   sha256_16way_init( &ctx_sha256 );
+   sha256_16way_update( &ctx_sha256, vhashA, 32 );
+   sha256_16way_close( &ctx_sha256, output );
+}
+// Scan nonces 16 at a time starting at pdata[27]; submits each lane that passes fulltest (unless opt_benchmark), sets *hashes_done, returns 0.
+int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[32*16] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<4]);   // row 7 of the interleaved hash: word 7 of every lane
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[27];
+   const uint32_t first_nonce = pdata[27];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t edata[32] __attribute__ ((aligned (64)));
+   __m512i  *noncev = (__m512i*)vdata + 27;   // aligned; row 27 = the nonce word of all 16 lanes
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   // we need bigendian data: byte-swap the 32 header words, then replicate them into all 16 lanes
+   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
+   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
+   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
+   intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
+        edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
+   // cache the sha256 state for the first LBRY_MIDSTATE bytes; the nonce (word 27) lies beyond them
+   sha256_16way_init( &sha256_16w_mid );
+   sha256_16way( &sha256_16w_mid, vdata, LBRY_MIDSTATE );   // NOTE(review): lbry_16way_hash calls sha256_16way_update - confirm which name the sha256 header declares
+   // each pass hashes nonces n..n+15, one per lane
+   do
+   {
+      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
+                                                  n+11, n+10, n+ 9, n+ 8,
+                                                  n+ 7, n+ 6, n+ 5, n+ 4,
+                                                  n+ 3, n+ 2, n+ 1, n ) );
+      lbry_16way_hash( hash, vdata );
+      // cheap pre-filter on hash word 7 before the full target comparison
+      for ( int i = 0; i < 16; i++ )
+      if ( unlikely( hash7[ i ] <= Htarg ) )
+      {
+         // deinterleave hash for lane
+         extr_lane_16x32( lane_hash, hash, i, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+            pdata[27] = n + i;
+            submit_lane_solution( work, lane_hash, mythr, i );
+         }
+      }
+      n += 16;
+   } while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+
+
+#elif defined(LBRY_8WAY)

 static __thread sha256_8way_context sha256_8w_mid;

@@ -91,11 +224,6 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   __m256i  *noncev = (__m256i*)vdata + 27;   // aligned
   int thr_id = mythr->id;  // thr_id arg is deprecated

-   uint64_t htmax[] = {          0,        0xF,       0xFF,
-                             0xFFF,     0xFFFF, 0x10000000 };
-   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
-                        0xFFFFF000, 0xFFFF0000,          0 };
-
   // we need bigendian data...
   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
@@ -106,33 +234,30 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
   casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
   casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
   intrlv_8x32( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 1024 );
+                       edata, edata, edata, edata, 1024 );
+
   sha256_8way_init( &sha256_8w_mid );
   sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );

-   for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
+   do
   {
-      uint32_t mask = masks[m];
-      do
-      {
-        *noncev = mm256_bswap_32( _mm256_set_epi32(
-                                          n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
-         lbry_8way_hash( hash, vdata );
+      *noncev = mm256_bswap_32( _mm256_set_epi32(
+                                       n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
+      lbry_8way_hash( hash, vdata );

-         for ( int i = 0; i < 8; i++ )  if ( !( hash7[ i ] & mask ) )
+      for ( int i = 0; i < 8; i++ )
+      if ( unlikely( hash7[ i ] <= Htarg ) )
+      {
+         // deinterleave hash for lane
+         extr_lane_8x32( lane_hash, hash, i, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
-            // deinterleave hash for lane
-            extr_lane_8x32( lane_hash, hash, i, 256 );
-            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-            {
-              pdata[27] = n + i;
-              submit_lane_solution( work, lane_hash, mythr, i );
-            }
+            pdata[27] = n + i;
+            submit_lane_solution( work, lane_hash, mythr, i );
         }
-         n += 8;
-      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
-      break;
-   }
+      }
+      n += 8;
+   } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce + 1;
   return 0;
 }
--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -98,16 +98,23 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }

 bool register_lbry_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT | SHA_OPT;
-#if defined (LBRY_8WAY)
+  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
+#if defined (LBRY_16WAY)
+  gate->scanhash              = (void*)&scanhash_lbry_16way;
+  gate->hash                  = (void*)&lbry_16way_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
+#elif defined (LBRY_8WAY)
  gate->scanhash              = (void*)&scanhash_lbry_8way;
  gate->hash                  = (void*)&lbry_8way_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
 #elif defined (LBRY_4WAY)
  gate->scanhash              = (void*)&scanhash_lbry_4way;
  gate->hash                  = (void*)&lbry_4way_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT;
 #else 
  gate->scanhash              = (void*)&scanhash_lbry;
  gate->hash                  = (void*)&lbry_hash;
+  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
 #endif
  gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
  gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
--- a/algo/ripemd/lbry-gate.h
+++ b/algo/ripemd/lbry-gate.h
@@ -4,11 +4,20 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

+
+// 16 way needs a 16 way sha256, so LBRY_16WAY is left commented out and AVX2 builds use 8 way.
+//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+//  #define LBRY_16WAY
+#if defined(__AVX2__)
+  #define LBRY_8WAY
+#endif
+/*
 #if !defined(__SHA__)
 #if defined(__AVX2__)
  #define LBRY_8WAY
 #endif
 #endif
+*/

 #define LBRY_NTIME_INDEX 25
 #define LBRY_NBITS_INDEX 26
@@ -18,7 +27,12 @@

 bool register_lbry_algo( algo_gate_t* gate );

-#if defined(LBRY_8WAY)
+#if defined(LBRY_16WAY)
+
+void lbry_16way_hash( void *state, const void *input );
+int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+#elif defined(LBRY_8WAY)

 void lbry_8way_hash( void *state, const void *input );
 int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
--- a/algo/ripemd/lbry.c
+++ b/algo/ripemd/lbry.c
@@ -80,9 +80,6 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
 	// we need bigendian data...
        swab32_array( endiandata, pdata, 32 );

-#ifdef DEBUG_ALGO
-	printf("[%d] Htarg=%X\n", thr_id, Htarg);
-#endif
 	for (int m=0; m < sizeof(masks); m++) {
 		if (Htarg <= htmax[m]) {
 			uint32_t mask = masks[m];
@@ -90,23 +87,11 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
 				pdata[27] = ++n;
 				be32enc(&endiandata[27], n);
 				lbry_hash(hash64, &endiandata);
-#ifndef DEBUG_ALGO
 				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
-					*hashes_done = n - first_nonce + 1;
-					return true;
+               pdata[27] = n;
+               submit_solution( work, hash64, mythr );
 				}
-#else
-				if (!(n % 0x1000) && !thr_id) printf(".");
-				if (!(hash64[7] & mask)) {
-					printf("[%d]",thr_id);
-					if (fulltest(hash64, ptarget)) {
-						*hashes_done = n - first_nonce + 1;
-						return true;
-					}
-				}
-#endif
-			} while (n < max_nonce && !work_restart[thr_id].restart);
-			// see blake.c if else to understand the loop on htmax => mask
+			} while ( (n < max_nonce -8) && !work_restart[thr_id].restart);
 			break;
 		}
 	}
--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -623,3 +623,303 @@ void ripemd160_8way_close( ripemd160_8way_context  *sc, void *dst )

 #endif // __AVX2__

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+//  RIPEMD-160 16 way: sixteen independent hashes, one per 32-bit lane of a __m512i.
+//  The five boolean round functions below are applied bitwise across all lanes.
+//  F16W_1: x ^ y ^ z
+#define F16W_1(x, y, z) \
+   _mm512_xor_si512( _mm512_xor_si512( x, y ), z )
+//  F16W_2: ((y ^ z) & x) ^ z, i.e. a bitwise select: x ? y : z
+#define F16W_2(x, y, z) \
+   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( y, z ), x ), z )
+//  F16W_3: (x | ~y) ^ z  (assuming mm512_not is a bitwise complement)
+#define F16W_3(x, y, z) \
+   _mm512_xor_si512( _mm512_or_si512( x, mm512_not( y ) ), z )
+//  F16W_4: ((x ^ y) & z) ^ y, i.e. a bitwise select: z ? x : y
+#define F16W_4(x, y, z) \
+   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( x, y ), z ), y )
+//  F16W_5: x ^ (y | ~z)  (assuming mm512_not is a bitwise complement)
+#define F16W_5(x, y, z) \
+   _mm512_xor_si512( x, _mm512_or_si512( y, mm512_not( z ) ) )
+//  One step: a = rol32( a + f(b,c,d) + r + k, s ) + e; c = rol32( c, 10 ). NOTE(review): k is broadcast with m512_const1_64, so K1x/K2x presumably repeat the 32-bit constant in both halves - confirm.
+#define RR_16W(a, b, c, d, e, f, s, r, k) \
+do{ \
+   a = _mm512_add_epi32( mm512_rol_32( _mm512_add_epi32( _mm512_add_epi32( \
+                _mm512_add_epi32( a, f( b ,c, d ) ), r ), \
+                                 m512_const1_64( k ) ), s ), e ); \
+   c = mm512_rol_32( c, 10 );\
+} while (0)
+//  Step on the first line of computation: registers A1..E1, constant K1<k>.
+#define ROUND1_16W(a, b, c, d, e, f, s, r, k)  \
+        RR_16W(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
+//  Step on the second line of computation: registers A2..E2, constant K2<k>.
+#define ROUND2_16W(a, b, c, d, e, f, s, r, k)  \
+        RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
+// Compress the 16 buffered message vectors in sc->buf into the chaining state sc->val, for all 16 lanes at once.
+static void ripemd160_16way_round( ripemd160_16way_context *sc )
+{
+   const __m512i *in = (__m512i*)sc->buf;
+   __m512i *h  = (__m512i*)sc->val;
+   register __m512i A1, B1, C1, D1, E1;
+   register __m512i A2, B2, C2, D2, E2;
+   __m512i tmp;
+   // both lines of computation start from the current chaining value
+   A1 = A2 = h[0];
+   B1 = B2 = h[1];
+   C1 = C2 = h[2];
+   D1 = D2 = h[3];
+   E1 = E2 = h[4];
+   // first line: five groups of 16 steps using F16W_1 .. F16W_5 in that order
+   ROUND1_16W( A, B, C, D, E, F16W_1, 11, in[ 0], 1 );
+   ROUND1_16W( E, A, B, C, D, F16W_1, 14, in[ 1], 1 );
+   ROUND1_16W( D, E, A, B, C, F16W_1, 15, in[ 2], 1 );
+   ROUND1_16W( C, D, E, A, B, F16W_1, 12, in[ 3], 1 );
+   ROUND1_16W( B, C, D, E, A, F16W_1,  5, in[ 4], 1 );
+   ROUND1_16W( A, B, C, D, E, F16W_1,  8, in[ 5], 1 );
+   ROUND1_16W( E, A, B, C, D, F16W_1,  7, in[ 6], 1 );
+   ROUND1_16W( D, E, A, B, C, F16W_1,  9, in[ 7], 1 );
+   ROUND1_16W( C, D, E, A, B, F16W_1, 11, in[ 8], 1 );
+   ROUND1_16W( B, C, D, E, A, F16W_1, 13, in[ 9], 1 );
+   ROUND1_16W( A, B, C, D, E, F16W_1, 14, in[10], 1 );
+   ROUND1_16W( E, A, B, C, D, F16W_1, 15, in[11], 1 );
+   ROUND1_16W( D, E, A, B, C, F16W_1,  6, in[12], 1 );
+   ROUND1_16W( C, D, E, A, B, F16W_1,  7, in[13], 1 );
+   ROUND1_16W( B, C, D, E, A, F16W_1,  9, in[14], 1 );
+   ROUND1_16W( A, B, C, D, E, F16W_1,  8, in[15], 1 );
+
+   ROUND1_16W( E, A, B, C, D, F16W_2,  7, in[ 7], 2 );
+   ROUND1_16W( D, E, A, B, C, F16W_2,  6, in[ 4], 2 );
+   ROUND1_16W( C, D, E, A, B, F16W_2,  8, in[13], 2 );
+   ROUND1_16W( B, C, D, E, A, F16W_2, 13, in[ 1], 2 );
+   ROUND1_16W( A, B, C, D, E, F16W_2, 11, in[10], 2 );
+   ROUND1_16W( E, A, B, C, D, F16W_2,  9, in[ 6], 2 );
+   ROUND1_16W( D, E, A, B, C, F16W_2,  7, in[15], 2 );
+   ROUND1_16W( C, D, E, A, B, F16W_2, 15, in[ 3], 2 );
+   ROUND1_16W( B, C, D, E, A, F16W_2,  7, in[12], 2 );
+   ROUND1_16W( A, B, C, D, E, F16W_2, 12, in[ 0], 2 );
+   ROUND1_16W( E, A, B, C, D, F16W_2, 15, in[ 9], 2 );
+   ROUND1_16W( D, E, A, B, C, F16W_2,  9, in[ 5], 2 );
+   ROUND1_16W( C, D, E, A, B, F16W_2, 11, in[ 2], 2 );
+   ROUND1_16W( B, C, D, E, A, F16W_2,  7, in[14], 2 );
+   ROUND1_16W( A, B, C, D, E, F16W_2, 13, in[11], 2 );
+   ROUND1_16W( E, A, B, C, D, F16W_2, 12, in[ 8], 2 );
+
+   ROUND1_16W( D, E, A, B, C, F16W_3, 11, in[ 3], 3 );
+   ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[10], 3 );
+   ROUND1_16W( B, C, D, E, A, F16W_3,  6, in[14], 3 );
+   ROUND1_16W( A, B, C, D, E, F16W_3,  7, in[ 4], 3 );
+   ROUND1_16W( E, A, B, C, D, F16W_3, 14, in[ 9], 3 );
+   ROUND1_16W( D, E, A, B, C, F16W_3,  9, in[15], 3 );
+   ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[ 8], 3 );
+   ROUND1_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
+   ROUND1_16W( A, B, C, D, E, F16W_3, 14, in[ 2], 3 );
+   ROUND1_16W( E, A, B, C, D, F16W_3,  8, in[ 7], 3 );
+   ROUND1_16W( D, E, A, B, C, F16W_3, 13, in[ 0], 3 );
+   ROUND1_16W( C, D, E, A, B, F16W_3,  6, in[ 6], 3 );
+   ROUND1_16W( B, C, D, E, A, F16W_3,  5, in[13], 3 );
+   ROUND1_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
+   ROUND1_16W( E, A, B, C, D, F16W_3,  7, in[ 5], 3 );
+   ROUND1_16W( D, E, A, B, C, F16W_3,  5, in[12], 3 );
+
+   ROUND1_16W( C, D, E, A, B, F16W_4, 11, in[ 1], 4 );
+   ROUND1_16W( B, C, D, E, A, F16W_4, 12, in[ 9], 4 );
+   ROUND1_16W( A, B, C, D, E, F16W_4, 14, in[11], 4 );
+   ROUND1_16W( E, A, B, C, D, F16W_4, 15, in[10], 4 );
+   ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 0], 4 );
+   ROUND1_16W( C, D, E, A, B, F16W_4, 15, in[ 8], 4 );
+   ROUND1_16W( B, C, D, E, A, F16W_4,  9, in[12], 4 );
+   ROUND1_16W( A, B, C, D, E, F16W_4,  8, in[ 4], 4 );
+   ROUND1_16W( E, A, B, C, D, F16W_4,  9, in[13], 4 );
+   ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 3], 4 );
+   ROUND1_16W( C, D, E, A, B, F16W_4,  5, in[ 7], 4 );
+   ROUND1_16W( B, C, D, E, A, F16W_4,  6, in[15], 4 );
+   ROUND1_16W( A, B, C, D, E, F16W_4,  8, in[14], 4 );
+   ROUND1_16W( E, A, B, C, D, F16W_4,  6, in[ 5], 4 );
+   ROUND1_16W( D, E, A, B, C, F16W_4,  5, in[ 6], 4 );
+   ROUND1_16W( C, D, E, A, B, F16W_4, 12, in[ 2], 4 );
+
+   ROUND1_16W( B, C, D, E, A, F16W_5,  9, in[ 4], 5 );
+   ROUND1_16W( A, B, C, D, E, F16W_5, 15, in[ 0], 5 );
+   ROUND1_16W( E, A, B, C, D, F16W_5,  5, in[ 5], 5 );
+   ROUND1_16W( D, E, A, B, C, F16W_5, 11, in[ 9], 5 );
+   ROUND1_16W( C, D, E, A, B, F16W_5,  6, in[ 7], 5 );
+   ROUND1_16W( B, C, D, E, A, F16W_5,  8, in[12], 5 );
+   ROUND1_16W( A, B, C, D, E, F16W_5, 13, in[ 2], 5 );
+   ROUND1_16W( E, A, B, C, D, F16W_5, 12, in[10], 5 );
+   ROUND1_16W( D, E, A, B, C, F16W_5,  5, in[14], 5 );
+   ROUND1_16W( C, D, E, A, B, F16W_5, 12, in[ 1], 5 );
+   ROUND1_16W( B, C, D, E, A, F16W_5, 13, in[ 3], 5 );
+   ROUND1_16W( A, B, C, D, E, F16W_5, 14, in[ 8], 5 );
+   ROUND1_16W( E, A, B, C, D, F16W_5, 11, in[11], 5 );
+   ROUND1_16W( D, E, A, B, C, F16W_5,  8, in[ 6], 5 );
+   ROUND1_16W( C, D, E, A, B, F16W_5,  5, in[15], 5 );
+   ROUND1_16W( B, C, D, E, A, F16W_5,  6, in[13], 5 );
+   // second line: the same 16 message words in a different order, functions F16W_5 down to F16W_1
+   ROUND2_16W( A, B, C, D, E, F16W_5,  8, in[ 5], 1 );
+   ROUND2_16W( E, A, B, C, D, F16W_5,  9, in[14], 1 );
+   ROUND2_16W( D, E, A, B, C, F16W_5,  9, in[ 7], 1 );
+   ROUND2_16W( C, D, E, A, B, F16W_5, 11, in[ 0], 1 );
+   ROUND2_16W( B, C, D, E, A, F16W_5, 13, in[ 9], 1 );
+   ROUND2_16W( A, B, C, D, E, F16W_5, 15, in[ 2], 1 );
+   ROUND2_16W( E, A, B, C, D, F16W_5, 15, in[11], 1 );
+   ROUND2_16W( D, E, A, B, C, F16W_5,  5, in[ 4], 1 );
+   ROUND2_16W( C, D, E, A, B, F16W_5,  7, in[13], 1 );
+   ROUND2_16W( B, C, D, E, A, F16W_5,  7, in[ 6], 1 );
+   ROUND2_16W( A, B, C, D, E, F16W_5,  8, in[15], 1 );
+   ROUND2_16W( E, A, B, C, D, F16W_5, 11, in[ 8], 1 );
+   ROUND2_16W( D, E, A, B, C, F16W_5, 14, in[ 1], 1 );
+   ROUND2_16W( C, D, E, A, B, F16W_5, 14, in[10], 1 );
+   ROUND2_16W( B, C, D, E, A, F16W_5, 12, in[ 3], 1 );
+   ROUND2_16W( A, B, C, D, E, F16W_5,  6, in[12], 1 );
+
+   ROUND2_16W( E, A, B, C, D, F16W_4,  9, in[ 6], 2 );
+   ROUND2_16W( D, E, A, B, C, F16W_4, 13, in[11], 2 );
+   ROUND2_16W( C, D, E, A, B, F16W_4, 15, in[ 3], 2 );
+   ROUND2_16W( B, C, D, E, A, F16W_4,  7, in[ 7], 2 );
+   ROUND2_16W( A, B, C, D, E, F16W_4, 12, in[ 0], 2 );
+   ROUND2_16W( E, A, B, C, D, F16W_4,  8, in[13], 2 );
+   ROUND2_16W( D, E, A, B, C, F16W_4,  9, in[ 5], 2 );
+   ROUND2_16W( C, D, E, A, B, F16W_4, 11, in[10], 2 );
+   ROUND2_16W( B, C, D, E, A, F16W_4,  7, in[14], 2 );
+   ROUND2_16W( A, B, C, D, E, F16W_4,  7, in[15], 2 );
+   ROUND2_16W( E, A, B, C, D, F16W_4, 12, in[ 8], 2 );
+   ROUND2_16W( D, E, A, B, C, F16W_4,  7, in[12], 2 );
+   ROUND2_16W( C, D, E, A, B, F16W_4,  6, in[ 4], 2 );
+   ROUND2_16W( B, C, D, E, A, F16W_4, 15, in[ 9], 2 );
+   ROUND2_16W( A, B, C, D, E, F16W_4, 13, in[ 1], 2 );
+   ROUND2_16W( E, A, B, C, D, F16W_4, 11, in[ 2], 2 );
+
+   ROUND2_16W( D, E, A, B, C, F16W_3,  9, in[15], 3 );
+   ROUND2_16W( C, D, E, A, B, F16W_3,  7, in[ 5], 3 );
+   ROUND2_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
+   ROUND2_16W( A, B, C, D, E, F16W_3, 11, in[ 3], 3 );
+   ROUND2_16W( E, A, B, C, D, F16W_3,  8, in[ 7], 3 );
+   ROUND2_16W( D, E, A, B, C, F16W_3,  6, in[14], 3 );
+   ROUND2_16W( C, D, E, A, B, F16W_3,  6, in[ 6], 3 );
+   ROUND2_16W( B, C, D, E, A, F16W_3, 14, in[ 9], 3 );
+   ROUND2_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
+   ROUND2_16W( E, A, B, C, D, F16W_3, 13, in[ 8], 3 );
+   ROUND2_16W( D, E, A, B, C, F16W_3,  5, in[12], 3 );
+   ROUND2_16W( C, D, E, A, B, F16W_3, 14, in[ 2], 3 );
+   ROUND2_16W( B, C, D, E, A, F16W_3, 13, in[10], 3 );
+   ROUND2_16W( A, B, C, D, E, F16W_3, 13, in[ 0], 3 );
+   ROUND2_16W( E, A, B, C, D, F16W_3,  7, in[ 4], 3 );
+   ROUND2_16W( D, E, A, B, C, F16W_3,  5, in[13], 3 );
+
+   ROUND2_16W( C, D, E, A, B, F16W_2, 15, in[ 8], 4 );
+   ROUND2_16W( B, C, D, E, A, F16W_2,  5, in[ 6], 4 );
+   ROUND2_16W( A, B, C, D, E, F16W_2,  8, in[ 4], 4 );
+   ROUND2_16W( E, A, B, C, D, F16W_2, 11, in[ 1], 4 );
+   ROUND2_16W( D, E, A, B, C, F16W_2, 14, in[ 3], 4 );
+   ROUND2_16W( C, D, E, A, B, F16W_2, 14, in[11], 4 );
+   ROUND2_16W( B, C, D, E, A, F16W_2,  6, in[15], 4 );
+   ROUND2_16W( A, B, C, D, E, F16W_2, 14, in[ 0], 4 );
+   ROUND2_16W( E, A, B, C, D, F16W_2,  6, in[ 5], 4 );
+   ROUND2_16W( D, E, A, B, C, F16W_2,  9, in[12], 4 );
+   ROUND2_16W( C, D, E, A, B, F16W_2, 12, in[ 2], 4 );
+   ROUND2_16W( B, C, D, E, A, F16W_2,  9, in[13], 4 );
+   ROUND2_16W( A, B, C, D, E, F16W_2, 12, in[ 9], 4 );
+   ROUND2_16W( E, A, B, C, D, F16W_2,  5, in[ 7], 4 );
+   ROUND2_16W( D, E, A, B, C, F16W_2, 15, in[10], 4 );
+   ROUND2_16W( C, D, E, A, B, F16W_2,  8, in[14], 4 );
+
+   ROUND2_16W( B, C, D, E, A, F16W_1,  8, in[12], 5 );
+   ROUND2_16W( A, B, C, D, E, F16W_1,  5, in[15], 5 );
+   ROUND2_16W( E, A, B, C, D, F16W_1, 12, in[10], 5 );
+   ROUND2_16W( D, E, A, B, C, F16W_1,  9, in[ 4], 5 );
+   ROUND2_16W( C, D, E, A, B, F16W_1, 12, in[ 1], 5 );
+   ROUND2_16W( B, C, D, E, A, F16W_1,  5, in[ 5], 5 );
+   ROUND2_16W( A, B, C, D, E, F16W_1, 14, in[ 8], 5 );
+   ROUND2_16W( E, A, B, C, D, F16W_1,  6, in[ 7], 5 );
+   ROUND2_16W( D, E, A, B, C, F16W_1,  8, in[ 6], 5 );
+   ROUND2_16W( C, D, E, A, B, F16W_1, 13, in[ 2], 5 );
+   ROUND2_16W( B, C, D, E, A, F16W_1,  6, in[13], 5 );
+   ROUND2_16W( A, B, C, D, E, F16W_1,  5, in[14], 5 );
+   ROUND2_16W( E, A, B, C, D, F16W_1, 15, in[ 0], 5 );
+   ROUND2_16W( D, E, A, B, C, F16W_1, 13, in[ 3], 5 );
+   ROUND2_16W( C, D, E, A, B, F16W_1, 11, in[ 9], 5 );
+   ROUND2_16W( B, C, D, E, A, F16W_1, 11, in[11], 5 );
+   // fold both lines into the state; each sum lands one word earlier, so h[0] is held in tmp until last
+   tmp =  _mm512_add_epi32( _mm512_add_epi32( h[1], C1 ), D2 );
+   h[1] = _mm512_add_epi32( _mm512_add_epi32( h[2], D1 ), E2 );
+   h[2] = _mm512_add_epi32( _mm512_add_epi32( h[3], E1 ), A2 );
+   h[3] = _mm512_add_epi32( _mm512_add_epi32( h[4], A1 ), B2 );
+   h[4] = _mm512_add_epi32( _mm512_add_epi32( h[0], B1 ), C2 );
+   h[0] = tmp;
+}
+// Reset a 16-way context: load the five initial chaining words into sc->val and zero the byte counter.
+void ripemd160_16way_init( ripemd160_16way_context *sc )
+{
+   sc->val[0] = m512_const1_64( 0x6745230167452301 );   // each 64-bit constant repeats one 32-bit word twice
+   sc->val[1] = m512_const1_64( 0xEFCDAB89EFCDAB89 );   // (presumably broadcast so every 32-bit lane matches - confirm)
+   sc->val[2] = m512_const1_64( 0x98BADCFE98BADCFE );
+   sc->val[3] = m512_const1_64( 0x1032547610325476 );
+   sc->val[4] = m512_const1_64( 0xC3D2E1F0C3D2E1F0 );
+   sc->count_high = sc->count_low = 0;
+}
+// Absorb len bytes per lane from data, running a compression round each time 64 bytes are buffered.
+void ripemd160_16way( ripemd160_16way_context *sc, const void *data,
+                      size_t len )
+{
+   __m512i *vdata = (__m512i*)data;
+   size_t ptr;
+   const int block_size = 64;
+   // ptr = bytes already buffered in the current block
+   ptr = (unsigned)sc->count_low & (block_size - 1U);
+   while ( len > 0 )
+   {
+      size_t clen;
+      uint32_t clow, clow2;
+      // take as much as fits in the current block
+      clen = block_size - ptr;
+      if ( clen > len )
+         clen = len;
+      memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );   // one __m512i per 4 bytes; NOTE(review): >>2 drops any remainder, so len is presumably a multiple of 4
+      vdata = vdata + (clen>>2);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == block_size )
+      {
+         ripemd160_16way_round( sc );
+         ptr = 0;
+      }
+      clow = sc->count_low;
+      clow2 = clow + clen;
+      sc->count_low = clow2;
+      if ( clow2 < clow )   // low word wrapped: carry into the high word
+         sc->count_high++;
+   }
+}
+// Pad the buffered data (0x80 word, zeros, 64-bit bit length), run the final round(s) and copy the five state vectors to dst.
+void ripemd160_16way_close( ripemd160_16way_context  *sc, void *dst )
+{
+   unsigned ptr, u;
+   uint32_t low, high;
+   const int block_size = 64;
+   const int pad = block_size - 8;   // offset of the 8-byte length field
+   // ptr = bytes already buffered in the current block
+   ptr = (unsigned)sc->count_low & ( block_size - 1U);
+   sc->buf[ ptr>>2 ] = m512_const1_32( 0x80 );   // padding marker in the next whole word; NOTE(review): presumes ptr is a multiple of 4
+   ptr += 4;
+   // no room left for the length field: zero-fill and compress this block, then start a zeroed one
+   if ( ptr > pad )
+   {
+       memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
+       ripemd160_16way_round( sc );
+       memset_zero_512( sc->buf, pad>>2 );
+   }
+   else
+       memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
+    // message length in bits (byte count << 3), low word first
+    low = sc->count_low;
+    high = (sc->count_high << 3) | (low >> 29);
+    low = low << 3;
+    sc->buf[  pad>>2      ] = _mm512_set1_epi32( low  );
+    sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
+    ripemd160_16way_round( sc );
+    for (u = 0; u < 5; u ++)
+        casti_m512i( dst, u ) = sc->val[u];   // dst receives 5 vectors, same lane layout as the state
+}
+
+#endif  // AVX512
--- a/algo/ripemd/ripemd-hash-4way.h
+++ b/algo/ripemd/ripemd-hash-4way.h
@@ -32,7 +32,21 @@ void ripemd160_8way_init( ripemd160_8way_context *sc );
 void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len );
 void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );

+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+typedef struct
+{
+   __m512i buf[64>>2];   // one 64-byte block per lane; each vector holds 4 bytes of all 16 lanes
+   __m512i val[5];       // chaining state, five 32-bit words per lane
+   uint32_t count_high, count_low;   // bytes absorbed so far, kept as two 32-bit halves
+} __attribute__ ((aligned (128))) ripemd160_16way_context;
+// init resets the context, ripemd160_16way absorbs data, close pads and writes the 5-vector digest to dst.
+void ripemd160_16way_init( ripemd160_16way_context *sc );
+void ripemd160_16way( ripemd160_16way_context *sc, const void *data,
+                      size_t len );
+void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
+
+#endif // AVX512
 #endif // __AVX2__
 #endif // __SSE4_2__
 #endif // RIPEMD_HASH_4WAY_H__