Compare commits

..

2 Commits
v25.6 ... v26.1

Author SHA1 Message Date
Jay D Dee
b34565bfac v26.1 2026-01-13 19:17:47 -05:00
Jay D Dee
8f2f9ec3e9 v25.7 2025-11-15 10:44:32 -05:00
27 changed files with 569 additions and 488 deletions

View File

@@ -32,8 +32,6 @@ Requirements
32 bit CPUs are not supported.
Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.
Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand; it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.
@@ -75,6 +73,18 @@ If not what makes it happen or not happen?
Change Log
----------
v26.1
Fixed segfault in scrypt algo on some older CPUs.
v25.7
Fixed a bug calculating TTF longer than 1 year.
Faster argon2d.
Faster hamsi AVX512.
Faster swifftx AVX2.
Other small fixes and improvements.
v25.6
Added argon2d1000, argon2d16000 algos.

View File

@@ -66,82 +66,60 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#if defined(__SSSE3__) || defined(__ARM_NEON)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = v128_alignr8(B1, B0, 8); \
v128_t t1 = v128_alignr8(B0, B1, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = v128_alignr8(D1, D0, 8); \
t1 = v128_alignr8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = v128_alignr8( B1, B0, 8 ); \
B1 = v128_alignr8( B0, B1, 8 ); \
B0 = t; \
t = v128_alignr8( D1, D0, 8 ); \
D0 = v128_alignr8( D0, D1, 8 ); \
D1 = t; \
}
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = v128_alignr8(B0, B1, 8); \
v128_t t1 = v128_alignr8(B1, B0, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = v128_alignr8(D0, D1, 8); \
t1 = v128_alignr8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = v128_alignr8( B0, B1, 8 ); \
B1 = v128_alignr8( B1, B0, 8 ); \
B0 = t; \
t = v128_alignr8( D0, D1, 8 ); \
D0 = v128_alignr8( D1, D0, 8 ); \
D1 = t; \
}
#else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = D0; \
v128_t t1 = B0; \
D0 = C0; \
C0 = C1; \
C1 = D0; \
D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \
D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \
B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \
B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = D0; \
D0 = v128_unpackhi64( D1, v128_unpacklo64( D0, D0 ) ); \
D1 = v128_unpackhi64( t, v128_unpacklo64( D1, D1 ) ); \
t = B0; \
B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \
B1 = v128_unpackhi64( B1, v128_unpacklo64( t, t ) ); \
}
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = B0; \
B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \
B1 = v128_unpackhi64( t, v128_unpacklo64( B1, B1 ) ); \
t = D0; \
D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \
D1 = v128_unpackhi64( D1, v128_unpacklo64( t, t ) ); \
}
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0, t1; \
t0 = C0; \
C0 = C1; \
C1 = t0; \
t0 = B0; \
t1 = D0; \
B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \
B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \
D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \
D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#endif
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \
{ \
G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \
DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G1( A0, B0, C1, D0, A1, B1, C0, D1 ); \
G2( A0, B0, C1, D0, A1, B1, C0, D1 ); \
UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
}
#else /* __AVX2__ */
#include <immintrin.h>
@@ -211,7 +189,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
@@ -219,17 +196,14 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp1 = C0; \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
C1 = tmp1; \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0x33); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0xCC); \
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
} while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -237,7 +211,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
@@ -247,27 +220,21 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp1 = C0; \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
C1 = tmp1; \
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
} while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0);
@@ -275,12 +242,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0);
@@ -290,12 +254,73 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#include <immintrin.h>
/*
static inline __m512i muladd(__m512i x, __m512i y)
{
__m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
}
*/
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 32); \
D1 = _mm512_ror_epi64(D1, 32); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
}
#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 16); \
D1 = _mm512_ror_epi64(D1, 16); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
}
/*
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = muladd(A0, B0); \
@@ -316,7 +341,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0)
*/
/*
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = muladd(A0, B0); \
@@ -337,15 +363,14 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0)
*/
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while ((void)0, 0)
@@ -354,10 +379,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while ((void)0, 0)
@@ -366,15 +389,17 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
static const __m512i swap_q0 = { 0,1, 8,9, 2,3, 10,11 };
static const __m512i swap_q1 = { 4,5, 12,13, 6,7, 14,15 };
static const __m512i uswap_q0 = { 0,1, 4,5, 8,9, 12,13 };
static const __m512i uswap_q1 = { 2,3, 6,7, 10,11, 14,15 };
#define SWAP_HALVES(A0, A1) \
do { \
__m512i t; \
@@ -383,19 +408,36 @@ static inline __m512i muladd(__m512i x, __m512i y)
A0 = t; \
} while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, swap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, swap_q1, A1 ); \
A0 = t; \
}
#define UNSWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, uswap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, uswap_q1, A1 ); \
A0 = t; \
}
/*
#define SWAP_QUARTERS(A0, A1) \
do { \
SWAP_HALVES(A0, A1); \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
} while((void)0, 0)
*/
/*
#define UNSWAP_QUARTERS(A0, A1) \
do { \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
SWAP_HALVES(A0, A1); \
} while((void)0, 0)
*/
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
do { \

View File

@@ -683,8 +683,9 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
mj[14] = mm256_rol_64( M[14], 15 );
mj[15] = mm256_rol_64( M[15], 16 );
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
__m256i K = _mm256_set1_epi64x( 0x5555555555555550ULL );
static const __m256i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm256_add_epi64( K, Kincr );
@@ -1094,7 +1095,7 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
__m512i dH[16] )
{
__m512i qt[32], xl, xh;
__m512i mh[16];
__m512i mh[16], mj[16];
int i;
for ( i = 0; i < 16; i++ )
@@ -1117,8 +1118,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
__m512i mj[16];
mj[ 0] = mm512_rol_64( M[ 0], 1 );
mj[ 1] = mm512_rol_64( M[ 1], 2 );
mj[ 2] = mm512_rol_64( M[ 2], 3 );
@@ -1136,8 +1135,11 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
mj[14] = mm512_rol_64( M[14], 15 );
mj[15] = mm512_rol_64( M[15], 16 );
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
__m512i K = _mm512_set1_epi64( 0x5555555555555550ULL );
static const __m512i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm512_add_epi64( K, Kincr );

View File

@@ -503,32 +503,28 @@ do { \
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
t0 = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
t1 = _mm512_mask_shuffle_epi32( sD, 0xaaaa, sE, 0xb1 ); \
L8( s0, t0, s9, t1 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
t2 = _mm512_mask_shuffle_epi32( s6, 0x5555, s5, 0xb1 ); \
t3 = _mm512_mask_shuffle_epi32( sF, 0x5555, sE, 0xb1 ); \
L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
t4 = _mm512_mask_shuffle_epi32( s6, 0xaaaa, s7, 0xb1 ); \
t5 = _mm512_mask_shuffle_epi32( sF, 0xaaaa, sC, 0xb1 ); \
L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
t2 = _mm512_mask_shuffle_epi32( s4, 0x5555, s7, 0xb1 ); \
t3 = _mm512_mask_shuffle_epi32( sD, 0x5555, sC, 0xb1 ); \
L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
@@ -537,21 +533,20 @@ do { \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
t2 = _mm512_mask_shuffle_epi32( sA, 0x5555, s2, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
s8 = _mm512_mask_shuffle_epi32( s8, 0x5555, t0, 0xb1 ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
s2 = _mm512_mask_shuffle_epi32( s2, 0xaaaa, t2, 0xb1 ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
s3 = _mm512_mask_shuffle_epi32( s3, 0xaaaa, t3, 0xb1 ); \
sB = _mm512_mask_shuffle_epi32( sB, 0x5555, t3, 0xb1 ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
@@ -1142,7 +1137,7 @@ do { \
} \
} while (0)
// v3 ternary logic, 8 instructions, 2 local vars
// v4 ternary logic, 8 instructions, 2 local vars
#define SBOX( a, b, c, d ) \
{ \
__m256i tb, td; \
@@ -1268,7 +1263,7 @@ do { \
} while (0)
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
// v3, 15 instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i tb, td; \
@@ -1286,7 +1281,7 @@ do { \
#endif
/*
/ v2, 16 instructions, 10 TL equivalent instructions
/ v2, 16 instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i t = mm256_xorand( d, a, c ); \

View File

@@ -80,14 +80,14 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
__m512i t = a0; \
a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512( a2, a3 ); \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0); \
a1 = _mm512_or_si512( a1, a3 ); \
a3 = _mm512_xor_si512( a3, a2 ); \
t = _mm512_xor_si512( t, a1 ); \
a2 = _mm512_and_si512( a2, a1 ); \
a1 = mm512_xnor( a1, a0 ); \
a1 = mm512_nxor( a1, a0 ); \
a0 = t; \
}
@@ -527,14 +527,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m256i t = a0; \
a0 = mm256_xoror( a3, a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm256_xorand( a2, a3, t ); \
a2 = mm256_xorand( a1, a2, a0); \
a1 = _mm256_or_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
t = _mm256_xor_si256( t, a1 ); \
a2 = _mm256_and_si256( a2, a1 ); \
a1 = mm256_xnor( a1, a0 ); \
a1 = mm256_nxor( a1, a0 ); \
a0 = t; \
}

View File

@@ -69,18 +69,18 @@
v128_t t = a0; \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* ~a1 ^ (a3 & t) */ \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = v128_xnor( a1, a0 ); \
a1 = v128_nxor( a1, a0 ); \
a0 = t; \
}
#else
#elif defined(__ARM_NEON) || defined(__SSE2__)
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

View File

@@ -37,8 +37,8 @@
#if defined(SIMD512)
#define SCRYPT_THROUGHPUT 16
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
#define SCRYPT_THROUGHPUT 2
//#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
// #define SCRYPT_THROUGHPUT 2
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#elif defined(__SSE2__) || defined(__ARM_NEON)
@@ -162,7 +162,7 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
}
#endif // throughput 1
//
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
@@ -1230,7 +1230,8 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input,
#if ( SCRYPT_THROUGHPUT == 4 )
#if defined(__SHA__)
#if 0
//#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thrid )
@@ -1244,6 +1245,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
memcpy( tstate+16, midstate, 32 );
memcpy( tstate+24, midstate, 32 );
HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8,
ostate, ostate+8 );
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
input, input+20, W, W+32 );
HMAC_SHA256_80_init_SHA_2BUF( input+40, input+60, tstate+16, tstate+24,
ostate+16, ostate+24 );
PBKDF2_SHA256_80_128_SHA_2BUF( tstate+16, tstate+24, ostate+16, ostate+24,
input+40, input+60, W+64, W+96 );
/*
HMAC_SHA256_80_init( input, tstate, ostate );
PBKDF2_SHA256_80_128( tstate, ostate, input, W );
HMAC_SHA256_80_init( input +20, tstate+ 8, ostate+ 8 );
@@ -1252,7 +1262,7 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
PBKDF2_SHA256_80_128( tstate+16, ostate+16, input +40, W+64 );
HMAC_SHA256_80_init( input +60, tstate+24, ostate+24 );
PBKDF2_SHA256_80_128( tstate+24, ostate+24, input +60, W+96 );
*/
/*
// Working Linear single threaded SIMD
scrypt_core_simd128( W, V, N );
@@ -1278,11 +1288,16 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+ 8, ostate, ostate+ 8,
W, W+32, output, output+ 8 );
PBKDF2_SHA256_128_32_SHA_2BUF( tstate+16, tstate+24, ostate+16, ostate+24,
W+64, W+96, output+16, output+24 );
/*
PBKDF2_SHA256_128_32( tstate, ostate, W, output );
PBKDF2_SHA256_128_32( tstate+ 8, ostate+ 8, W+32, output+ 8 );
PBKDF2_SHA256_128_32( tstate+16, ostate+16, W+64, output+16 );
PBKDF2_SHA256_128_32( tstate+24, ostate+24, W+96, output+24 );
*/
return 1;
}
@@ -1390,13 +1405,13 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
thr_id );
#elif ( SCRYPT_THROUGHPUT == 4 )
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
thr_id );
#else
// #if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
// thr_id );
// #else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
thr_id );
#endif
// #endif
#elif ( SCRYPT_THROUGHPUT == 2 ) && ( defined(__SHA__) || defined(__ARM_FEATURE_SHA2) )
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
thr_id );
@@ -1444,11 +1459,6 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
opt_target_factor = 65536.0;
@@ -1477,8 +1487,9 @@ bool register_scrypt_algo( algo_gate_t* gate )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
break;
default:
scratchbuf_size = opt_param_n; // 1 way
scratchbuf_size = opt_param_n * 128; // 1 way
}
char t_units[4] = {0};

View File

@@ -67,7 +67,7 @@ static const uint64_t K512[80] =
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
#if defined(__AVX2__) && defined(__SHA512__)
#if defined(__AVX__) && defined(__SHA512__)
// SHA-512 implemented using SHA512 CPU extension.

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#include "sph_sha2.h"
#if defined(__SHA512__) && defined(__AVX2__)
#if defined(__SHA512__) && defined(__AVX__)
// Experimental, untested
// Need to substitute for sph_sha512

View File

@@ -305,7 +305,7 @@ do { \
xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \
xb0 = mm512_nxor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_16 do { \
@@ -898,7 +898,7 @@ do { \
xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \
xb0 = mm256_nxor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_8 do { \

View File

@@ -171,6 +171,53 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
{{ -30, 55, -58, -65, -95, -40, -98, 94 }},
};
#if defined(__AVX2__)
static const __m256i V256_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff,
0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#define V128_00FF _mm256_castsi256_si128( V256_00FF )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#endif
#if defined(SIMD512)
static const __m512i V512_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V256_0101 _mm512_castsi512_si256( V512_0101 )
#define V128_0101 _mm512_castsi512_si128( V512_0101 )
static const __m512i V512_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V256_0080 _mm512_castsi512_si256( V512_0080 )
#define V128_0080 _mm512_castsi512_si128( V512_0080 )
#elif defined(__AVX2__)
static const __m256i V256_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V128_0101 _mm256_castsi256_si128( V256_0101 )
static const __m256i V256_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V128_0080 _mm256_castsi256_si128( V256_0080 )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_0101 = { 0x0101010101010101, 0x0101010101010101 };
static const v128u64_t V128_0080 = { 0x0080008000800080, 0x0080008000800080 };
#endif
#if defined(__x86_64__)
#define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1)
@@ -190,13 +237,10 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
#define shufxor(x,s) XCAT(SHUFXOR_,s)(x)
#define REDUCE(x) \
v128_sub16( v128_and( x, v128_64( \
0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) )
v128_sub16( v128_and( x, V128_00FF ), v128_sra16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\
v128_sub16( x, v128_and( \
v128_64( 0x0101010101010101 ), \
v128_cmpgt16( x, v128_64( 0x0080008000800080 ) ) ) )
v128_sub16( x, v128_and( V128_0101, v128_cmpgt16( x, V128_0080 ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -293,10 +337,9 @@ do { \
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
v128u16_t t1= X(i); \
v128u16_t t2= X(j); \
X(i) = v128_unpacklo16( t1, t2 ); \
X(j) = v128_unpackhi16( t1, t2 ); \
v128u16_t t = X(i); \
X(i) = v128_unpacklo16( t, X(j) ); \
X(j) = v128_unpackhi16( t, X(j) ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -803,23 +846,12 @@ static const m256_v16 FFT256_Twiddle[] =
#define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)
#if defined(VL256)
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \
_mm256_srai_epi16( x, 8 ) )
#else
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
#endif
_mm256_sub_epi16( _mm256_and_si256( x, V256_00FF ), _mm256_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \
_mm256_set1_epi64x( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )
_mm256_sub_epi16( x, _mm256_and_si256( V256_0101, \
_mm256_cmpgt_epi16( x, V256_0080 ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -917,10 +949,9 @@ do { \
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
__m256i t1= X(i); \
__m256i t2= X(j); \
X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
__m256i t = X(i); \
X(i) = _mm256_unpacklo_epi16( t, X(j) ); \
X(j) = _mm256_unpackhi_epi16( t, X(j) ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -1658,10 +1689,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
_mm512_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
_mm512_set1_epi64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )
_mm512_sub_epi16( x, _mm512_and_si512( V512_0101, \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( x, V512_0080 ) ) ) )
// generic, except it calls targetted macros
#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) )

View File

@@ -640,24 +640,25 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#if defined(__AVX2__)
__m256i F0, F1, F2, F3, F4, F5, F6, F7;
__m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] );
__m256i *table = (__m256i*)fftTable;
__m256i tbl = table[ input[0] ];
__m256i *mul = (__m256i*)multipliers;
__m256i *out = (__m256i*)output;
F0 = _mm256_mullo_epi32( mul[0], tbl );
tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] );
tbl = table[ input[1] ];
F1 = _mm256_mullo_epi32( mul[1], tbl );
tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] );
tbl = table[ input[2] ];
F2 = _mm256_mullo_epi32( mul[2], tbl );
tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] );
tbl = table[ input[3] ];
F3 = _mm256_mullo_epi32( mul[3], tbl );
tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] );
tbl = table[ input[4] ];
F4 = _mm256_mullo_epi32( mul[4], tbl );
tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] );
tbl = table[ input[5] ];
F5 = _mm256_mullo_epi32( mul[5], tbl );
tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] );
tbl = table[ input[6] ];
F6 = _mm256_mullo_epi32( mul[6], tbl );
tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] );
tbl = table[ input[7] ];
F7 = _mm256_mullo_epi32( mul[7], tbl );
#define ADD_SUB( a, b ) \
@@ -677,9 +678,9 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
ADD_SUB( F1, F3 );
ADD_SUB( F4, F6 );
ADD_SUB( F5, F7 );
F5 = _mm256_slli_epi32( F5, 2 );
F6 = _mm256_slli_epi32( F6, 4 );
F7 = _mm256_slli_epi32( F7, 6 );
F5 = _mm256_slli_epi32( F5, 2 );
ADD_SUB( F0, F4 );
ADD_SUB( F1, F5 );
ADD_SUB( F2, F6 );

View File

@@ -4,11 +4,11 @@
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
rm cpuminer cpuminer-m2 cpuminer-m4 cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto > /dev/null
# armv9 needs gcc-13
# -march-armv9-a includes SVE2 but no crypto
# -march=armv9-a+crypto adds AES & SHA2 but not SHA512
# -march=armv9-a+crypto adds AES & SHA256 but not SHA512
make distclean || echo clean
rm -f config.status
@@ -27,18 +27,37 @@ CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-c
make -j $(nproc)
mv cpuminer cpuminer-armv9
# Apple M4: armv9.2, AES, SHA3, SVE2
make clean || echo clean
CFLAGS="-O3 -march=armv9.2-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m4
# Apple M2: armv8.6, AES, SHA3
make clean || echo clean
CFLAGS="-O3 -march=armv8.6-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m2
# SVE2 available in armv8.5
make clean || echo clean
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
# SHA3 available in armv8.4
# Apple M1: armv8.4 AES, SHA3
make clean || echo clean
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8.4-crypto-sha3
# Cortex-A76 (Orange Pi 5)
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8.2-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8.2-crypto
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl

View File

@@ -34,9 +34,7 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Graniterapids does not build with AVX10, SHA512 or APX.
# wait for Diamondrapids & gcc-15.
# Intel Core Graniterapids: AVX512, SHA256, VAES, AMX, needs gcc-14
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
@@ -44,13 +42,13 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-graniterapids
# SHA512 AVX10.1
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-avx10_1
# Graniterapids + SHA512, AVX10.1
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx10.1
# SHA512 AVX10.2
#make clean || echo clean
@@ -72,6 +70,9 @@ mv cpuminer cpuminer-graniterapids
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
# zen4 is close enough for older compiler
#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen5
@@ -138,13 +139,21 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
# SSE4.2 AES SHA: Intel Atom Goldmont, newer Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=goldmont -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-sse42-aes-sha
# SSE4.2 AES: Intel Westmere, older Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-aes-sse42
mv cpuminer cpuminer-sse42-aes
# SSE4.2: Intel Nehalem
make clean || echo clean

View File

@@ -2,8 +2,8 @@
#
# make clean and rm all the targetted executables.
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen* cpuminer-x64 cpuminer-armv* > /dev/null
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512* cpuminer-alderlake cpuminer-avx10* cpuminer-avx2* cpuminer-avx cpuminer-sse* cpuminer-ssse3 cpuminer-zen* cpuminer-x64 cpuminer-armv* cpuminer-m2 cpuminer-m4 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null
rm cpuminer-avx512* cpuminer-avx2* cpuminer-avx.exe cpuminer-sse* cpuminer-zen* cpuminer-x64.exe > /dev/null
make distclean > /dev/null

28
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.6.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 26.1.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.6'
PACKAGE_STRING='cpuminer-opt 25.6'
PACKAGE_VERSION='26.1'
PACKAGE_STRING='cpuminer-opt 26.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 26.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1431,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.6:";;
short | recursive ) echo "Configuration of cpuminer-opt 26.1:";;
esac
cat <<\_ACEOF
@@ -1536,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.6
cpuminer-opt configure 26.1
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.6, which was
It was created by cpuminer-opt $as_me 26.1, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3591,7 +3591,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.6'
VERSION='26.1'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -5808,11 +5808,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5
printf %s "checking for $CXX option to enable C++11 features... " >&6; }
if test ${ac_cv_prog_cxx_cxx11+y}
if test ${ac_cv_prog_cxx_11+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_cv_prog_cxx_cxx11=no
ac_cv_prog_cxx_11=no
ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
@@ -5854,11 +5854,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5
printf %s "checking for $CXX option to enable C++98 features... " >&6; }
if test ${ac_cv_prog_cxx_cxx98+y}
if test ${ac_cv_prog_cxx_98+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_cv_prog_cxx_cxx98=no
ac_cv_prog_cxx_98=no
ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
@@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.6, which was
This file was extended by cpuminer-opt $as_me 26.1, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.6
cpuminer-opt config.status 26.1
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [25.6])
AC_INIT([cpuminer-opt], [26.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.6.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 26.1.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.6'
PACKAGE_STRING='cpuminer-opt 25.6'
PACKAGE_VERSION='26.1'
PACKAGE_STRING='cpuminer-opt 26.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
'configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems.
'configure' configures cpuminer-opt 26.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1424,7 +1424,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.6:";;
short | recursive ) echo "Configuration of cpuminer-opt 26.1:";;
esac
cat <<\_ACEOF
@@ -1528,7 +1528,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.6
cpuminer-opt configure 26.1
generated by GNU Autoconf 2.72
Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.6, which was
It was created by cpuminer-opt $as_me 26.1, which was
generated by GNU Autoconf 2.72. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3768,7 +3768,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.6'
VERSION='26.1'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7581,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.6, which was
This file was extended by cpuminer-opt $as_me 26.1, which was
generated by GNU Autoconf 2.72. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7649,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.6
cpuminer-opt config.status 26.1
configured by $0, generated by GNU Autoconf 2.72,
with options \\"\$ac_cs_config\\"

View File

@@ -921,39 +921,32 @@ out:
return rc;
}
// Scale a raw hash rate into a human-readable magnitude.
// On return *prefix holds the metric unit prefix character ('k', 'M', ...,
// 'Y', or 0 when no scaling applies) and *hashrate has been divided by the
// matching power of 1000.
void scale_hash_for_display ( double* hashrate, char* prefix )
{
   // Upper bound for each prefix, its divisor, and the prefix character.
   static const struct { double limit; double div; char tag; } steps[] =
   {
      { 1e7,  1e3,  'k' }, { 1e10, 1e6,  'M' }, { 1e13, 1e9,  'G' },
      { 1e16, 1e12, 'T' }, { 1e19, 1e15, 'P' }, { 1e22, 1e18, 'E' },
      { 1e25, 1e21, 'Z' }
   };
   if ( *hashrate < 1e4 )
   {
      *prefix = 0;   // small enough to show unscaled
      return;
   }
   for ( int i = 0; i < 7; i++ )
   {
      if ( *hashrate < steps[i].limit )
      {
         *prefix = steps[i].tag;
         *hashrate /= steps[i].div;
         return;
      }
   }
   // Anything at or above 1e25 is reported in yotta-hashes.
   *prefix = 'Y';
   *hashrate /= 1e24;
}
// Does not account for leap years.
static inline void sprintf_et( char *str, long unsigned int seconds )
{
long unsigned int min = seconds / 60;
long unsigned int sec = seconds % 60;
long unsigned int hrs = min / 60;
if ( unlikely( hrs ) )
long unsigned int minutes = seconds / 60;
if ( minutes )
{
long unsigned int hours = minutes / 60;
if ( hours )
{
long unsigned int days = hours / 24;
if ( days )
{
long unsigned int days = hrs / 24;
long unsigned int years = days / 365;
if ( years ) // 0y000d
sprintf( str, "%luy%lud", years, years % 365 );
else if ( days ) // 0d00h
sprintf( str, "%lud%02luh", days, hrs % 24 );
else // 0h00m
sprintf( str, "%luh%02lum", hrs, min % 60 );
if ( years )
sprintf( str, "%luy%03lud", years, days % 365 ); // 0y000d
else
sprintf( str, "%lud%02luh", days, hours % 24 ); // 0d00h
}
else // 0m00s
sprintf( str, "%lum%02lus", min, sec );
else
sprintf( str, "%luh%02lum", hours, minutes % 60 ); // 0h00m
}
else
sprintf( str, "%lum%02lus", minutes, seconds % 60 ); // 0m00s
}
else
sprintf( str, "%lus", seconds ); // 0s
}
const long double exp32 = EXP32; // 2**32
@@ -1239,7 +1232,7 @@ static int share_result( int result, struct work *work,
sprintf( ares, "A%d", accepted_share_count );
sprintf( bres, "B%d", solved_block_count );
if ( reason )
stale = strstr( reason, "job" );
stale = strstr( reason, "job" ) || strstr( reason, "Job" );
else if ( work )
stale = work->data[ algo_gate.ntime_index ]
!= g_work.data[ algo_gate.ntime_index ];
@@ -2833,67 +2826,29 @@ static void show_credits()
static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_neon = has_neon(); // AArch64
bool cpu_has_sve = has_sve(); // aarch64 only, insignificant
bool cpu_has_sve2 = has_sve2(); // AArch64 only
bool cpu_has_sme = has_sme();
bool cpu_has_sme2 = has_sme2();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
bool cpu_has_vaes = has_vaes(); // X86_64 only
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64 version
bool sw_has_neon = false; // AArch64
bool sw_has_sve = false; // AArch64
bool sw_has_sve2 = false; // AArch64
bool sw_has_sve = false;
bool sw_has_sve2 = false;
bool sw_has_sme = false;
bool sw_has_sme2 = false;
bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64
bool sw_has_sse41 = false; // x86_64
bool sw_has_ssse3 = false;
bool sw_has_sse41 = false;
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_avx10 = false;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_amx = false;
bool sw_has_apx = false;
bool sw_has_aes = false; // x86_64 or AArch64
bool sw_has_vaes = false; // x86_64
bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64
/*
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_aes;
bool use_vaes;
bool use_sha256;
bool use_sha512;
bool use_neon;
bool use_none;
*/
bool sw_has_sha512 = false;
#if defined(__x86_64__)
sw_has_x86_64 = true;
#elif defined(__aarch64__)
@@ -2928,14 +2883,15 @@ static bool cpu_capability( bool display_only )
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
sw_has_avx512 = true;
#endif
// AVX10 version is not significant as of AVX10.2. If that changes use a better
// way to test the version than sequentially.
// #if defined(__AVX10_2__)
//
// #elif defined(__AVX10_1__)
#if defined(__AVX10_1__)
#if defined(__AVX10_1__) // version is not significant
sw_has_avx10 = true;
#endif
#ifdef __AMX_TILE__
sw_has_amx = true;
#endif
#ifdef __APX_F__
sw_has_apx = true;
#endif
// x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
@@ -2955,6 +2911,7 @@ static bool cpu_capability( bool display_only )
#if defined(__ARM_NEON)
sw_has_neon = true;
#endif
// FYI, SVE & SME not used by cpuminer
#if defined(__ARM_FEATURE_SVE)
sw_has_sve = true;
#endif
@@ -2975,8 +2932,7 @@ static bool cpu_capability( bool display_only )
// Build
printf( "SW built on " __DATE__
#if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
__clang_patchlevel__ );
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__ );
#elif defined(__GNUC__)
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif
@@ -3002,27 +2958,30 @@ static bool cpu_capability( bool display_only )
printf("CPU features: ");
if ( cpu_arch_x86_64() )
{
if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( has_avx10() ) printf( " AVX10.%d", avx10_version() );
else if ( has_avx512() ) printf( " AVX512" );
else if ( has_avx2() ) printf( " AVX2 " );
else if ( has_avx() ) printf( " AVX " );
else if ( has_sse42() ) printf( " SSE4.2" );
else if ( has_sse41() ) printf( " SSE4.1" );
else if ( has_ssse3() ) printf( " SSSE3 " );
else if ( has_sse2() ) printf( " SSE2 " );
if ( has_amx() ) printf( " AMX" );
if ( has_apx_f() ) printf( " APX" );
}
else if ( cpu_arch_aarch64() )
{
if ( cpu_has_neon ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
if ( has_neon() ) printf( " NEON" );
if ( has_sve2() ) printf( " SVE2-%d", sve_vector_length() );
else if ( has_sve() ) printf( " SVE" );
if ( has_sme2() ) printf( " SME2" );
else if ( has_sme() ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
if ( has_vaes() ) printf( " VAES" );
else if ( has_aes() ) printf( " AES" );
if ( has_sha512() ) printf( " SHA512" );
else if ( has_sha256() ) printf( " SHA256" );
printf("\nSW features: ");
if ( sw_has_x86_64 )
@@ -3035,6 +2994,8 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_amx ) printf( " AMX" );
if ( sw_has_apx ) printf( " APX" );
}
else if ( sw_has_aarch64 )
{

View File

@@ -542,7 +542,9 @@ void applog_hash(void *hash);
void format_hashrate(double hashrate, char *output);
void print_hash_tests(void);
// Factors of 1000 used for hashes, ie kH/s, Mh/s.
void scale_hash_for_display ( double* hashrate, char* units );
// Factors of 1024 used for bytes, ie kiB, MiB.
void format_number_si( double* hashrate, char* si_units );
void report_summary_log( bool force );

View File

@@ -447,7 +447,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#define v128_nxor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -469,7 +469,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#define v128_nxor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#endif
@@ -642,6 +642,15 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_shuflr32(v) _mm_shuffle_epi32( v, 0x39 )
#define v128_shufll32(v) _mm_shuffle_epi32( v, 0x93 )
/* Zen6 AMD only
// Reverse bits in bytes
#if defined(__AVX512VL__) && defined(__AVX512BMM__)
#define v128_bitrev8 _mm_vbitrevb_epi8
#endif
*/
// Endian byte swap.
#if defined(__SSSE3__)

View File

@@ -170,7 +170,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#define mm256_nxor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -208,7 +208,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \
#define mm256_nxor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) )
#endif
@@ -409,6 +409,15 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
*/
/* Zen6 AMD only
// Reverse bits in bytes
#if defined(__AVX512VL__) && defined(__AVX512BMM__)
#define mm256_bitrev8 _mm256_vbitrevb_epi8
#endif
*/
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) _mm256_shuffle_epi8( v, V256_BSWAP64 )

View File

@@ -18,8 +18,13 @@
// AVX512 intrinsics have a few changes from previous conventions.
//
// "_mm512_cmp" instructions now returns a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction.
// "_mm512_cmp" instructions now return a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction. It is also
// slower than AVX2 cmp. There is no version of AVX512 cmp that returns a
// vector result resulting in a double penalty if a vector result is needed:
// slower cmp instruction & extra instruction to convert bit mask into
// vector mask. 256 bit & 128 bit still have legacy cmp returning vector
// while AVX512VL adds masked versions returning bit mask.
//
// Many previously sizeless (si) instructions now have sized (epi) versions
// to accommodate masking packed elements.
@@ -30,7 +35,7 @@
// list.
//
// "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other
// AVX512 permutes can cross all lanes.
// AVX512 instructions using the permute name can cross all lanes.
//
// New alignr instructions for epi64 and epi32 operate across the entire
// vector but slower than epi8 which continues to be restricted to 128 bit
@@ -50,16 +55,17 @@
// parentheses to ensure the expression argument is evaluated first.
// - if an argument is to referenced multiple times a C inline function
// should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times (wasteful) or produces side
// from being evaluated multiple times (wasteful) or produce side
// effects (very bad).
//
// There are 2 areas where overhead is a major concern: constants and
// permutations.
//
// Constants need to be composed at run time by assembling individual
// elements, very expensive. The cost is proportional to the number of
// different elements therefore use the largest element size possible,
// merge smaller integer elements to 64 bits, and group repeated elements.
// elements or loaded from memory, very expensive. The cost of runtime
// construction is proportional to the number of different elements
// therefore use the largest element size possible merging smaller integer
// elements to 64 bits, and group repeated elements.
//
// Constants with repeating patterns can be optimized with the smaller
// patterns repeated more frequently being more efficient.
@@ -67,14 +73,15 @@
// Some specific constants can be very efficient. Zero is very efficient,
// 1 and -1 slightly less so.
//
// If an expensive constant is to be reused in the same function it should
// be declared as a local variable defined once and reused.
// If an expensive constant is to be reused in the same function it may
// be declared as a local variable defined once and reused. If frequently
// used it can be declared as a static const in memory.
//
// Permutations can be very expensive if they use a vector control index,
// even if the permutation itself is quite efficient.
// The index is essentially a constant with all the baggage that brings.
// The same rules apply, if an index is to be reused it should be defined
// as a local. This applies specifically to bswap operations.
// even if the permute instruction itself is quite efficient.
// The index is essentially a vector constant with all the baggage that
// brings. The same rules apply, if an index is to be reused it should either
// be defined as a local or static const.
//
// Permutations that cross 128 bit lanes are typically slower and often need
// a vector control index. If the permutation doesn't need to cross 128 bit
@@ -221,7 +228,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
#define mm512_nxor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b )
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef )
@@ -241,6 +248,24 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
/* not used
#if defined(__AVX512VBMI2__)
#define mm512_ror_16( v, c ) _mm512_shrdi_epi16( c, v, v )
#define mm512_rol_16( v, c ) _mm512_shldi_epi16( c, v, v )
#endif
*/
/* Zen6 AMD only
// Reverse bits in bytes
#if defined(__AVX512BMM__)
#define mm512_bitrev8 _mm512_vbitrevb_epi8
#endif
*/
//
// Reverse byte order of packed elements, vectorized endian conversion.
@@ -249,9 +274,17 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 )
/* not used
#if defined(__AVX512VBMI2__)
#define mm512_bswap_16( v ) mm512_ror_16( v, 8 )
#else
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
#endif
*/
#define mm512_bswap_16( v ) \
@@ -431,8 +464,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
_mm512_castsi512_ps( v2 ), c ) );
// 64 bit lanes
// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE.
// Redundant with ror & rol but included for consistency with AVX2/SSE.
#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32 mm512_qrev32 // grandfathered

View File

@@ -126,7 +126,7 @@
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
// ~( v1 ^ v0 ), same as (~v1) ^ v0
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
#define v128_nxor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, args reversed for consistency with x86_64
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
@@ -349,52 +349,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Bit rotation
/*
#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 )
#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 )
#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuflr64_16( v ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 48 ) ? v128_shufll64_16( v ) \
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shufll64_16( v ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 48 ) ? v128_shuflr64_16( v ) \
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
*/
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
@@ -477,8 +431,10 @@ static inline uint32x4_t v128_shuflr32( uint32x4_t v )
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }
// reverse bits in bytes, nothing like it in x86_64
/* not used
// reverse bits in bytes, nothing like it in x86_64 until Zen6
#define v128_bitrev8 vrbitq_u8
*/
// reverse byte order
#define v128_bswap16(v) (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )

View File

@@ -16,8 +16,8 @@
#include "miner.h"
#include "simd-utils.h"
// Missing on MinGW, MacOS
#if defined(__aarch64__) && !defined(WIN32) && !defined(__APPLE__)
// hwcap.h missing on MinGW, MacOS
#if defined(__aarch64__) && !(defined(WIN32) || defined(__APPLE__))
#define ARM_AUXV
#endif
@@ -191,6 +191,7 @@ static inline int cpu_fanpercent()
#define CPU_INFO (1)
#define CACHE_TLB_DESCRIPTOR (2)
#define EXTENDED_FEATURES (7)
#define EXTENDED_FEATURE_ID (0x21)
#define AVX10_FEATURES (0x24)
#define HIGHEST_EXT_FUNCTION (0x80000000)
#define EXTENDED_CPU_INFO (0x80000001)
@@ -254,8 +255,8 @@ static inline int cpu_fanpercent()
#define AVX512_BF16_Flag (1<< 5)
#define AMX_FP16_Flag (1<<21)
#define AVX_IFMA_Flag (1<<23)
#define MOVRS_Flag (1<<31) // Both names are referenced in docs
#define AVX10_MOVRS_Flag (1<<31)
#define MOVRS_Flag (1<<31) // Both names are referenced in docs
// EDX
#define AVX_VNNI_INT8_Flag (1<< 4)
#define AVX_NE_CONVERT_Flag (1<< 5)
@@ -264,6 +265,10 @@ static inline int cpu_fanpercent()
#define AVX10_Flag (1<<19)
#define APX_F_Flag (1<<21)
// EXTENDED_FEATURE_ID: EAX=0x21, ECX=0
// EAX
#define AVX512_BMM_Flag (1<<23) // Zen6 AMD only
// AVX10_FEATURES: EAX=0x24, ECX=0
// EBX
#define AVX10_VERSION_mask 0xff // bits [7:0]
@@ -474,6 +479,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
}
/*
// ARM feature compiler flags
#ifdef __aarch64__
#warning "__aarch64__"
#endif
@@ -509,16 +515,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
#endif
*/
// Typical display format: AVX10.[version]_[vectorlength], if vector length is
// omitted 256 is the default.
// Ex: AVX10.1_512
// Flags:
// AVX10 128 256 512
// 0 0 0 0 = AVX10 not supported
// 1 1 1 0 = AVX10 256 bit max (version 2)
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
// Other combinations are not defined.
static inline bool cpu_arch_x86_64()
{
#if defined(__x86_64__)
@@ -744,7 +740,7 @@ static inline bool has_avx512()
#endif
}
static inline bool has_vbmi()
static inline bool has_avx512vbmi()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
@@ -755,7 +751,7 @@ static inline bool has_vbmi()
#endif
}
static inline bool has_vbmi2()
static inline bool has_avx512vbmi2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
@@ -766,6 +762,29 @@ static inline bool has_vbmi2()
#endif
}
// Reports whether the CPU advertises AVX512-BMM (Zen6, AMD only).
// Queried via CPUID leaf EXTENDED_FEATURE_ID (0x21), subleaf 0, EAX bit 23.
static inline bool has_avx512bmm()
{
#if defined(__x86_64__)
unsigned int regs[4] = { 0 };
cpuid( EXTENDED_FEATURE_ID, 0, regs );
return ( regs[ EAX_Reg ] & AVX512_BMM_Flag ) != 0;
#else
return false;
#endif
}
// Reports whether the CPU advertises AMX tile support.
// Queried via CPUID leaf EXTENDED_FEATURES (7), subleaf 0, EDX AMX_TILE bit.
static inline bool has_amx()
{
#if defined(__x86_64__)
unsigned int regs[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, regs );
return ( regs[ EDX_Reg ] & AMX_TILE_Flag ) != 0;
#else
return false;
#endif
}
static inline bool has_aes()
{
#if defined(__x86_64__)
@@ -815,13 +834,9 @@ static inline bool has_sveaes()
static inline bool has_sha256()
{
#if defined(__x86_64__)
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA2)
// NEON SHA256
unsigned int cpu_info[4] = { 0 };
@@ -835,7 +850,7 @@ static inline bool has_sha256()
static inline bool has_sha512()
{
#if defined(__x86_64__)
if ( has_avx2() )
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
@@ -852,7 +867,7 @@ static inline bool has_sha512()
#endif
}
// Arm only
// ARM64 only
static inline bool has_sha3()
{
#if defined(__aarch64__) && defined(HWCAP_SHA3)
@@ -944,16 +959,6 @@ static inline int sve_vector_length()
return 0;
}
// Assume min_vlen refers to the register size
// Returns the RISC-V vector register length in bits, or 0 when RVV is
// unavailable or the toolchain does not define __riscv_v_min_vlen.
// NOTE(review): __riscv_v_min_vlen is the compile-time minimum VLEN the
// binary was built for, not necessarily the hardware's actual VLEN.
static inline int rvv_vector_length()
{
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
// Fallback for non-RISC-V targets or builds without the vector extension.
return 0;
}
// generic
static inline int vector_length()
{
#if defined(__x86_64__)
@@ -965,8 +970,8 @@ static inline int vector_length()
return has_sve() ? sve_vector_length()
: has_neon() ? 128
: 0;
#elif defined(__riscv) && defined(__riscv_vector)
return rvv_vector_length();
#elif defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
return 0;
}

49
util.c
View File

@@ -304,39 +304,28 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0)
free(cmd);
}
void format_hashrate(double hashrate, char *output)
// Decimal SI, factors of 1000
void scale_hash_for_display ( double* hashrate, char* prefix )
{
char prefix = '\0';
if (hashrate < 10000) {
// nop
}
else if (hashrate < 1e7) {
prefix = 'k';
hashrate *= 1e-3;
}
else if (hashrate < 1e10) {
prefix = 'M';
hashrate *= 1e-6;
}
else if (hashrate < 1e13) {
prefix = 'G';
hashrate *= 1e-9;
}
else {
prefix = 'T';
hashrate *= 1e-12;
}
sprintf(
output,
prefix ? "%.2f %cH/s" : "%.2f H/s%c",
hashrate, prefix
);
if ( *hashrate < 1e4 ) *prefix = 0;
else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; }
else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; }
else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; }
else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; }
else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; }
else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; }
else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; }
else { *prefix = 'Y'; *hashrate /= 1e24; }
}
// For use with MiB etc
// Formats a raw hash rate into a human-readable string with a decimal SI
// prefix (k/M/G/T/...), e.g. "12.34 MH/s". Scaling is delegated to
// scale_hash_for_display; output must be large enough for the result.
void format_hashrate( double hashrate, char *output )
{
char prefix = '\0';
scale_hash_for_display( &hashrate, &prefix );
// With no prefix the "%c" still consumes the NUL, matching the scaled form.
if ( prefix )
sprintf( output, "%.2f %cH/s", hashrate, prefix );
else
sprintf( output, "%.2f H/s%c", hashrate, prefix );
}
// Binary SI, factors of 1024
void format_number_si( double* n, char* si_units )
{
if ( *n < 1024*10 ) { *si_units = 0; return; }

View File

@@ -55,9 +55,9 @@ the current directory it will be created.
Data file creation can take up to 30 minutes on a spinning hard drive.
Once created the new data file will be verified and used immediately
if a valid url and user were included on the command line.
if a valid url and user were included on the command line.
A default data file can be created by omitting the url option. That will
A default data file can also be created by omitting the url option. That will
either verify an existing default data file or create one and verify it,
then exit.