v3.23.0

2025-09-17 23:44:27 +00:00 · 2023-08-30 20:15:48 -04:00
parent 57a6b7b58b
commit 4378d2f841
72 changed files with 10184 additions and 2182 deletions
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -212,7 +212,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
   const uint32_t last_nonce = max_nonce - 16;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m512i sixteen = m512_const1_32( 16 );
+   const __m512i sixteen = _mm512_set1_epi32( 16 );

   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

@@ -398,7 +398,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;  
   const bool bench = opt_benchmark;
-   const __m256i eight = m256_const1_32( 8 );
+   const __m256i eight = _mm256_set1_epi32( 8 );

   // Prehash first block
   blake256_transform_le( phash, pdata, 512, 0 );
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -203,7 +203,7 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
             submit_solution( work, lane_hash, mythr );
         }
      }
-      *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
+      *noncev = _mm512_add_epi32( *noncev, _mm512_set1_epi32( 16 ) );
      n += 16;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
@@ -345,7 +345,7 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
             submit_solution( work, lane_hash, mythr );
         }
      }
-      *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
+      *noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
      n += 8;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -287,7 +287,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
             submit_solution( work, lane_hash, mythr );
         }
      }
-      *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
+      *noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
      n += 8;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
@@ -389,7 +389,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
              submit_solution( work, lane_hash, mythr );
 	      }
      }
-      *noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
+      *noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
      n += 4;
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
   pdata[19] = n;
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -103,7 +103,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
   const uint32_t last_nonce = max_nonce - 16;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m512i sixteen = m512_const1_32( 16 );
+   const __m512i sixteen = _mm512_set1_epi32( 16 );

   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

@@ -213,7 +213,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m256i eight = m256_const1_32( 8 );
+   const __m256i eight = _mm256_set1_epi32( 8 );

   // Prehash first block
   blake256_transform_le( phash, pdata, 512, 0 );
@@ -328,7 +328,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
           submit_solution( work, lane_hash, mythr );
        }
      }
-      *noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
+      *noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
      n += 4;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -62,10 +62,10 @@ inline void initState( uint64_t State[/*16*/] )
  state[1] = zero;
  state[2] = zero;
  state[3] = zero;
-  state[4] = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
-  state[5] = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
-  state[6] = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
-  state[7] = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
+  state[4] = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state[5] = _mm_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
+  state[6] = _mm_set_epi64x( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+  state[7] = _mm_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );

 #else
    //First 512 bis are zeros
@@ -299,10 +299,10 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,
  state1 =
  state2 =
  state3 = m128_zero;
-  state4 = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
-  state5 = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
-  state6 = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
-  state7 = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
+  state4 = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state5 = _mm_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
+  state6 = _mm_set_epi64x( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+  state7 = _mm_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );

  for ( int i = 0; i < nBlocks; i++ )
  { 
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -43,27 +43,29 @@ static const uint64_t blake2b_IV[8] =
  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };

-/*Blake2b's rotation*/
-static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
-    return ( w >> c ) | ( w << ( 64 - c ) );
-}
-
-// serial data is only 32 bytes so AVX2 is the limit for that dimension.
-// However, 2 way parallel looks trivial to code for AVX512 except for
-// a data dependency with rowa.
-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define G2W_4X64(a,b,c,d) \
   a = _mm512_add_epi64( a, b ); \
-   d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
+   d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
   c = _mm512_add_epi64( c, d ); \
-   b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \
+   b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 24 ); \
   a = _mm512_add_epi64( a, b ); \
-   d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
+   d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 16 ); \
   c = _mm512_add_epi64( c, d ); \
-   b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
+   b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );

+#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   G2W_4X64( s0, s1, s2, s3 ); \
+   s0 = mm512_shufll256_64( s0 ); \
+   s3 = mm512_swap256_128( s3); \
+   s2 = mm512_shuflr256_64( s2 ); \
+   G2W_4X64( s0, s1, s2, s3 ); \
+   s0 = mm512_shuflr256_64( s0 ); \
+   s3 = mm512_swap256_128( s3 ); \
+   s2 = mm512_shufll256_64( s2 ); 
+
+/*
 #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
   G2W_4X64( s0, s1, s2, s3 ); \
   s3 = mm512_shufll256_64( s3 ); \
@@ -73,6 +75,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   s3 = mm512_shuflr256_64( s3 ); \
   s1 = mm512_shufll256_64( s1 ); \
   s2 = mm512_swap256_128( s2 ); 
+*/

 #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -88,13 +91,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 )

-
 #endif  // AVX512

-#if defined __AVX2__
+#if defined(__AVX2__)

-// process 4 columns in parallel
-// returns void, updates all args
 #define G_4X64(a,b,c,d) \
   a = _mm256_add_epi64( a, b ); \
   d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
@@ -105,6 +105,18 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );

+// Pivoting about s1 instead of s0 reduces latency.
+#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   G_4X64( s0, s1, s2, s3 ); \
+   s0 = mm256_shufll_64( s0 ); \
+   s3 = mm256_swap_128( s3); \
+   s2 = mm256_shuflr_64( s2 ); \
+   G_4X64( s0, s1, s2, s3 ); \
+   s0 = mm256_shuflr_64( s0 ); \
+   s3 = mm256_swap_128( s3 ); \
+   s2 = mm256_shufll_64( s2 );
+
+/*
 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   G_4X64( s0, s1, s2, s3 ); \
   s3 = mm256_shufll_64( s3 ); \
@@ -114,6 +126,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   s3 = mm256_shuflr_64( s3 ); \
   s1 = mm256_shufll_64( s1 ); \
   s2 = mm256_swap_128( s2 );
+*/

 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -182,8 +195,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #endif // AVX2 else SSE2

-// Scalar
-//Blake2b's G function
+/*
+// Scalar, not used.
+
+static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
+    return ( w >> c ) | ( w << ( 64 - c ) );
+}
+
 #define G(r,i,a,b,c,d) \
  do { \
    a = a + b; \
@@ -196,8 +214,6 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    b = rotr64(b ^ c, 63); \
  } while(0)

-
-/*One Round of the Blake2b's compression function*/
 #define ROUND_LYRA(r)  \
    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -207,6 +223,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
+*/

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)