v3.7.4

2026-02-22 16:33:08 +00:00 · 2021-09-29 17:31:16 -04:00
parent 9b905fccc8
commit 2cd1507c2e
80 changed files with 8145 additions and 2097 deletions
--- a/algo/scrypt/neoscrypt.c
+++ b/algo/scrypt/neoscrypt.c
@@ -69,8 +69,12 @@ typedef unsigned int  uint;
 #define SCRYPT_HASH_BLOCK_SIZE 64U
 #define SCRYPT_HASH_DIGEST_SIZE 32U

-#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
-#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
+//#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
+//#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
+// The open-coded rotates above are disabled (note the unparenthesized b in "32 - b"); ROTL32/ROTR32 now forward to rol32/ror32, which are not defined in this hunk.
+#define ROTL32(a,b) rol32(a,b)
+#define ROTR32(a,b) ror32(a,b)
+

 #define U8TO32_BE(p) \
    (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
--- a/algo/scrypt/scrypt-core-4way.c
+++ b/algo/scrypt/scrypt-core-4way.c
--- a/algo/scrypt/scrypt-core-4way.h
+++ b/algo/scrypt/scrypt-core-4way.h
@@ -0,0 +1,70 @@
+#ifndef SCRYPT_CORE_4WAY_H__
+#define SCRYPT_CORE_4WAY_H__
+
+#include "simd-utils.h"
+#include <stdlib.h>
+#include <stdint.h>
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+// Declared only when AVX512F, VL, DQ and BW are all enabled. NOTE(review): "16way" presumably means 16 parallel lanes -- confirm against the implementation.
+void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );
+
+// Serial SIMD over 4 way parallel
+void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N );
+
+// 4 way parallel over serial SIMD
+void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N );
+
+#endif // __AVX512F__ && __AVX512VL__ && __AVX512DQ__ && __AVX512BW__
+
+#if defined(__AVX2__)
+// NOTE(review): N is not const-qualified here, unlike every other prototype in this header; harmless in a declaration but inconsistent.
+void scrypt_core_8way( __m256i *X, __m256i *V, uint32_t N );
+
+// 2 way parallel over SIMD128
+void scrypt_core_2way_simd128( __m256i *X, __m256i *V, const uint32_t N );
+
+// Double buffered 2 way parallel over SIMD128
+void scrypt_core_2way_simd128_2buf( __m256i *X, __m256i *V, const uint32_t N );
+
+// Triple buffered 2 way parallel over SIMD128
+void scrypt_core_2way_simd128_3buf( __m256i *X, __m256i *V, const uint32_t N );
+
+// Serial SIMD128 over 2 way parallel
+void scrypt_core_simd128_2way( uint64_t *X, uint64_t *V, const uint32_t N );
+
+// Double buffered simd over parallel
+void scrypt_core_simd128_2way_2buf( uint64_t *X, uint64_t *V, const uint32_t N );
+
+// Triple buffered 2 way
+void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, const uint32_t N );
+
+// Quadruple buffered
+void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N );
+
+#endif // __AVX2__
+
+#if defined(__SSE2__)
+
+// Parallel 4 way, 4x memory
+void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N );
+
+// Linear SIMD 1 way, 1x memory, lowest (NOTE(review): "lowest" presumably refers to memory use -- confirm)
+void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );
+
+// Double buffered, 2x memory
+void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N );
+
+// Triple buffered
+void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N );
+
+// Quadruple buffered, 4x memory
+void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N );
+
+#endif // __SSE2__
+
+// For reference only: 1 way variant taking plain uint32_t buffers; declared regardless of the SIMD feature macros above
+void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N );
+
+#endif   
+
--- a/algo/scrypt/scrypt-core-ref.c
+++ b/algo/scrypt/scrypt-core-ref.c
@@ -0,0 +1,206 @@
+#include "scrypt-core-ref.h"
+/* Rotate a left by b bits within 32 bits; b must be in 1..31 so that (32 - (b)) is a valid shift count. Used below on uint32_t sums with counts 7, 9, 13 and 18. */
+#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
+/* Salsa20/8-style mix: XOR the 16 words of C into B, run 8 rounds (4 column/row double rounds) on a copy of the result, then add the mixed words back into B. */
+static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
+{
+   uint32_t x0 = (B[ 0] ^= C[ 0]),
+            x1 = (B[ 1] ^= C[ 1]),
+            x2 = (B[ 2] ^= C[ 2]),
+            x3 = (B[ 3] ^= C[ 3]);
+   uint32_t x4 = (B[ 4] ^= C[ 4]),
+            x5 = (B[ 5] ^= C[ 5]),
+            x6 = (B[ 6] ^= C[ 6]),
+            x7 = (B[ 7] ^= C[ 7]);
+   uint32_t x8 = (B[ 8] ^= C[ 8]),
+            x9 = (B[ 9] ^= C[ 9]),
+            xa = (B[10] ^= C[10]),
+            xb = (B[11] ^= C[11]);
+   uint32_t xc = (B[12] ^= C[12]),
+            xd = (B[13] ^= C[13]),
+            xe = (B[14] ^= C[14]),
+            xf = (B[15] ^= C[15]);
+
+   /* Double round 1 of 4: operate on columns. */
+   x4 ^= ROTL(x0 + xc,  7);
+   x9 ^= ROTL(x5 + x1,  7);
+   xe ^= ROTL(xa + x6,  7);
+   x3 ^= ROTL(xf + xb,  7);
+   x8 ^= ROTL(x4 + x0,  9);
+   xd ^= ROTL(x9 + x5,  9);
+   x2 ^= ROTL(xe + xa,  9);
+   x7 ^= ROTL(x3 + xf,  9);
+   xc ^= ROTL(x8 + x4, 13);
+   x1 ^= ROTL(xd + x9, 13);
+   x6 ^= ROTL(x2 + xe, 13);
+   xb ^= ROTL(x7 + x3, 13);
+   x0 ^= ROTL(xc + x8, 18);
+   x5 ^= ROTL(x1 + xd, 18);
+   xa ^= ROTL(x6 + x2, 18);
+   xf ^= ROTL(xb + x7, 18);
+
+   /* Double round 1 of 4: operate on rows. */
+   x1 ^= ROTL(x0 + x3,  7);
+   x6 ^= ROTL(x5 + x4,  7);
+   xb ^= ROTL(xa + x9,  7);
+   xc ^= ROTL(xf + xe,  7);
+   x2 ^= ROTL(x1 + x0,  9);
+   x7 ^= ROTL(x6 + x5,  9);
+   x8 ^= ROTL(xb + xa,  9);
+   xd ^= ROTL(xc + xf,  9);
+   x3 ^= ROTL(x2 + x1, 13);
+   x4 ^= ROTL(x7 + x6, 13);
+   x9 ^= ROTL(x8 + xb, 13);
+   xe ^= ROTL(xd + xc, 13);
+   x0 ^= ROTL(x3 + x2, 18);
+   x5 ^= ROTL(x4 + x7, 18);
+   xa ^= ROTL(x9 + x8, 18);
+   xf ^= ROTL(xe + xd, 18);
+
+   /* Double round 2 of 4: operate on columns. */
+   x4 ^= ROTL(x0 + xc,  7);
+   x9 ^= ROTL(x5 + x1,  7);
+   xe ^= ROTL(xa + x6,  7);
+   x3 ^= ROTL(xf + xb,  7);
+   x8 ^= ROTL(x4 + x0,  9);
+   xd ^= ROTL(x9 + x5,  9);
+   x2 ^= ROTL(xe + xa,  9);
+   x7 ^= ROTL(x3 + xf,  9);
+   xc ^= ROTL(x8 + x4, 13);
+   x1 ^= ROTL(xd + x9, 13);
+   x6 ^= ROTL(x2 + xe, 13);
+   xb ^= ROTL(x7 + x3, 13);
+   x0 ^= ROTL(xc + x8, 18);
+   x5 ^= ROTL(x1 + xd, 18);
+   xa ^= ROTL(x6 + x2, 18);
+   xf ^= ROTL(xb + x7, 18);
+
+   /* Double round 2 of 4: operate on rows. */
+   x1 ^= ROTL(x0 + x3,  7);
+   x6 ^= ROTL(x5 + x4,  7);
+   xb ^= ROTL(xa + x9,  7);
+   xc ^= ROTL(xf + xe,  7);
+   x2 ^= ROTL(x1 + x0,  9);
+   x7 ^= ROTL(x6 + x5,  9);
+   x8 ^= ROTL(xb + xa,  9);
+   xd ^= ROTL(xc + xf,  9);
+   x3 ^= ROTL(x2 + x1, 13);
+   x4 ^= ROTL(x7 + x6, 13);
+   x9 ^= ROTL(x8 + xb, 13);
+   xe ^= ROTL(xd + xc, 13);
+   x0 ^= ROTL(x3 + x2, 18);
+   x5 ^= ROTL(x4 + x7, 18);
+   xa ^= ROTL(x9 + x8, 18);
+   xf ^= ROTL(xe + xd, 18);
+
+   /* Double round 3 of 4: operate on columns. */
+   x4 ^= ROTL(x0 + xc,  7);
+   x9 ^= ROTL(x5 + x1,  7);
+   xe ^= ROTL(xa + x6,  7);
+   x3 ^= ROTL(xf + xb,  7);
+   x8 ^= ROTL(x4 + x0,  9);
+   xd ^= ROTL(x9 + x5,  9);
+   x2 ^= ROTL(xe + xa,  9);
+   x7 ^= ROTL(x3 + xf,  9);
+   xc ^= ROTL(x8 + x4, 13);
+   x1 ^= ROTL(xd + x9, 13);
+   x6 ^= ROTL(x2 + xe, 13);
+   xb ^= ROTL(x7 + x3, 13);
+   x0 ^= ROTL(xc + x8, 18);
+   x5 ^= ROTL(x1 + xd, 18);
+   xa ^= ROTL(x6 + x2, 18);
+   xf ^= ROTL(xb + x7, 18);
+
+   /* Double round 3 of 4: operate on rows. */
+   x1 ^= ROTL(x0 + x3,  7);
+   x6 ^= ROTL(x5 + x4,  7);
+   xb ^= ROTL(xa + x9,  7);
+   xc ^= ROTL(xf + xe,  7);
+   x2 ^= ROTL(x1 + x0,  9);
+   x7 ^= ROTL(x6 + x5,  9);
+   x8 ^= ROTL(xb + xa,  9);
+   xd ^= ROTL(xc + xf,  9);
+   x3 ^= ROTL(x2 + x1, 13);
+   x4 ^= ROTL(x7 + x6, 13);
+   x9 ^= ROTL(x8 + xb, 13);
+   xe ^= ROTL(xd + xc, 13);
+   x0 ^= ROTL(x3 + x2, 18);
+   x5 ^= ROTL(x4 + x7, 18);
+   xa ^= ROTL(x9 + x8, 18);
+   xf ^= ROTL(xe + xd, 18);
+
+   /* Double round 4 of 4: operate on columns. */
+   x4 ^= ROTL(x0 + xc,  7);
+   x9 ^= ROTL(x5 + x1,  7);
+   xe ^= ROTL(xa + x6,  7);
+   x3 ^= ROTL(xf + xb,  7);
+   x8 ^= ROTL(x4 + x0,  9);
+   xd ^= ROTL(x9 + x5,  9);
+   x2 ^= ROTL(xe + xa,  9);
+   x7 ^= ROTL(x3 + xf,  9);
+   xc ^= ROTL(x8 + x4, 13);
+   x1 ^= ROTL(xd + x9, 13);
+   x6 ^= ROTL(x2 + xe, 13);
+   xb ^= ROTL(x7 + x3, 13);
+   x0 ^= ROTL(xc + x8, 18);
+   x5 ^= ROTL(x1 + xd, 18);
+   xa ^= ROTL(x6 + x2, 18);
+   xf ^= ROTL(xb + x7, 18);
+
+   /* Double round 4 of 4: operate on rows. */
+   x1 ^= ROTL(x0 + x3,  7);
+   x6 ^= ROTL(x5 + x4,  7);
+   xb ^= ROTL(xa + x9,  7);
+   xc ^= ROTL(xf + xe,  7);
+   x2 ^= ROTL(x1 + x0,  9);
+   x7 ^= ROTL(x6 + x5,  9);
+   x8 ^= ROTL(xb + xa,  9);
+   xd ^= ROTL(xc + xf,  9);
+   x3 ^= ROTL(x2 + x1, 13);
+   x4 ^= ROTL(x7 + x6, 13);
+   x9 ^= ROTL(x8 + xb, 13);
+   xe ^= ROTL(xd + xc, 13);
+   x0 ^= ROTL(x3 + x2, 18);
+   x5 ^= ROTL(x4 + x7, 18);
+   xa ^= ROTL(x9 + x8, 18);
+   xf ^= ROTL(xe + xd, 18);
+   /* Feed-forward: add the mixed words back onto the XORed input held in B. */
+   B[ 0] += x0;
+   B[ 1] += x1;
+   B[ 2] += x2;
+   B[ 3] += x3;
+   B[ 4] += x4;
+   B[ 5] += x5;
+   B[ 6] += x6;
+   B[ 7] += x7;
+   B[ 8] += x8;
+   B[ 9] += x9;
+   B[10] += xa;
+   B[11] += xb;
+   B[12] += xc;
+   B[13] += xd;
+   B[14] += xe;
+   B[15] += xf;
+}
+
+/** Reference scrypt core: stores N successive states of X in V, then N times XORs the V entry picked by X[16] into X; every step mixes both halves of X with xor_salsa8.
+ * @param X input/output: 32 words (128 bytes), updated in place
+ * @param V scratch buffer: entry i occupies words [32*i, 32*i + 32), so the first loop writes 32 * N words
+ * @param N factor (def. 1024). NOTE(review): the (N - 1) mask only reaches every entry when N is a power of two, and i * 32 is 32-bit arithmetic that wraps once N exceeds 2^27 -- confirm callers stay in range
+ */
+void scrypt_core_ref(uint32_t *X, uint32_t *V, uint32_t N)
+{
+   for (uint32_t i = 0; i < N; i++) {
+      memcpy(&V[i * 32], X, 128);   /* save the current 128-byte state as entry i; memcpy's declaration presumably comes via scrypt-core-ref.h -- confirm */
+      xor_salsa8(&X[0], &X[16]);    /* low half  = mix(low half ^ high half) */
+      xor_salsa8(&X[16], &X[0]);    /* high half = mix(high half ^ new low half) */
+   }
+   for (uint32_t i = 0; i < N; i++) {
+      uint32_t j = 32 * (X[16] & (N - 1));   /* word offset of the V entry selected by the first word of the high half */
+      for (uint8_t k = 0; k < 32; k++)
+         X[k] ^= V[j + k];                   /* fold the selected entry into the whole state */
+      xor_salsa8(&X[0], &X[16]);
+      xor_salsa8(&X[16], &X[0]);
+   }
+}
+
--- a/algo/scrypt/scrypt.c
+++ b/algo/scrypt/scrypt.c