mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2026-02-22 08:23:08 +00:00
v25.7
This commit is contained in:
@@ -75,6 +75,14 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v25.7
|
||||
|
||||
Fixed a bug calculating TTF longer than 1 year.
|
||||
Faster argon2d.
|
||||
Faster hamsi AVX512.
|
||||
Faster switfftx AVX2.
|
||||
Other small fixes and improvements.
|
||||
|
||||
v25.6
|
||||
|
||||
Added argon2d1000, argon2d16000 algos.
|
||||
|
||||
@@ -66,82 +66,60 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
||||
|
||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
v128_t t0 = v128_alignr8(B1, B0, 8); \
|
||||
v128_t t1 = v128_alignr8(B0, B1, 8); \
|
||||
B0 = t0; \
|
||||
B1 = t1; \
|
||||
\
|
||||
t0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = t0; \
|
||||
\
|
||||
t0 = v128_alignr8(D1, D0, 8); \
|
||||
t1 = v128_alignr8(D0, D1, 8); \
|
||||
D0 = t1; \
|
||||
D1 = t0; \
|
||||
} while ((void)0, 0)
|
||||
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||
{ \
|
||||
v128_t t = v128_alignr8( B1, B0, 8 ); \
|
||||
B1 = v128_alignr8( B0, B1, 8 ); \
|
||||
B0 = t; \
|
||||
t = v128_alignr8( D1, D0, 8 ); \
|
||||
D0 = v128_alignr8( D0, D1, 8 ); \
|
||||
D1 = t; \
|
||||
}
|
||||
|
||||
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
v128_t t0 = v128_alignr8(B0, B1, 8); \
|
||||
v128_t t1 = v128_alignr8(B1, B0, 8); \
|
||||
B0 = t0; \
|
||||
B1 = t1; \
|
||||
\
|
||||
t0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = t0; \
|
||||
\
|
||||
t0 = v128_alignr8(D0, D1, 8); \
|
||||
t1 = v128_alignr8(D1, D0, 8); \
|
||||
D0 = t1; \
|
||||
D1 = t0; \
|
||||
} while ((void)0, 0)
|
||||
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||
{ \
|
||||
v128_t t = v128_alignr8( B0, B1, 8 ); \
|
||||
B1 = v128_alignr8( B1, B0, 8 ); \
|
||||
B0 = t; \
|
||||
t = v128_alignr8( D0, D1, 8 ); \
|
||||
D0 = v128_alignr8( D1, D0, 8 ); \
|
||||
D1 = t; \
|
||||
}
|
||||
|
||||
#else /* SSE2 */
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
v128_t t0 = D0; \
|
||||
v128_t t1 = B0; \
|
||||
D0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = D0; \
|
||||
D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \
|
||||
D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \
|
||||
B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \
|
||||
B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \
|
||||
} while ((void)0, 0)
|
||||
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||
{ \
|
||||
v128_t t = D0; \
|
||||
D0 = v128_unpackhi64( D1, v128_unpacklo64( D0, D0 ) ); \
|
||||
D1 = v128_unpackhi64( t, v128_unpacklo64( D1, D1 ) ); \
|
||||
t = B0; \
|
||||
B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \
|
||||
B1 = v128_unpackhi64( B1, v128_unpacklo64( t, t ) ); \
|
||||
}
|
||||
|
||||
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||
{ \
|
||||
v128_t t = B0; \
|
||||
B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \
|
||||
B1 = v128_unpackhi64( t, v128_unpacklo64( B1, B1 ) ); \
|
||||
t = D0; \
|
||||
D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \
|
||||
D1 = v128_unpackhi64( D1, v128_unpacklo64( t, t ) ); \
|
||||
}
|
||||
|
||||
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
v128_t t0, t1; \
|
||||
t0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = t0; \
|
||||
t0 = B0; \
|
||||
t1 = D0; \
|
||||
B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \
|
||||
B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \
|
||||
D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \
|
||||
D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \
|
||||
} while ((void)0, 0)
|
||||
#endif
|
||||
|
||||
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
do { \
|
||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
} while ((void)0, 0)
|
||||
#define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \
|
||||
{ \
|
||||
G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
||||
G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
||||
DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
||||
G1( A0, B0, C1, D0, A1, B1, C0, D1 ); \
|
||||
G2( A0, B0, C1, D0, A1, B1, C0, D1 ); \
|
||||
UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
|
||||
}
|
||||
|
||||
#else /* __AVX2__ */
|
||||
|
||||
#include <immintrin.h>
|
||||
@@ -211,7 +189,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
||||
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||
\
|
||||
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||
@@ -219,17 +196,14 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
||||
|
||||
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
do { \
|
||||
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
||||
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
||||
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
tmp1 = C0; \
|
||||
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
C0 = C1; \
|
||||
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||
C1 = tmp1; \
|
||||
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0x33); \
|
||||
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
||||
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
|
||||
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
|
||||
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
||||
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
|
||||
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
|
||||
} while(0);
|
||||
|
||||
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -237,7 +211,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
||||
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||
\
|
||||
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||
@@ -247,27 +220,21 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
||||
do { \
|
||||
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
||||
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
||||
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
tmp1 = C0; \
|
||||
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
C0 = C1; \
|
||||
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
|
||||
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
|
||||
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
||||
C1 = tmp1; \
|
||||
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
|
||||
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
||||
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
||||
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
|
||||
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
|
||||
} while((void)0, 0);
|
||||
|
||||
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
do{ \
|
||||
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
\
|
||||
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
\
|
||||
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
\
|
||||
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
} while((void)0, 0);
|
||||
|
||||
@@ -275,12 +242,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
||||
do{ \
|
||||
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
\
|
||||
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
\
|
||||
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
\
|
||||
G1_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
|
||||
G2_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
|
||||
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
} while((void)0, 0);
|
||||
|
||||
@@ -290,12 +254,73 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
/*
|
||||
static inline __m512i muladd(__m512i x, __m512i y)
|
||||
{
|
||||
__m512i z = _mm512_mul_epu32(x, y);
|
||||
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
|
||||
}
|
||||
*/
|
||||
|
||||
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||
{ \
|
||||
__m512i z0, z1; \
|
||||
z0 = _mm512_mul_epu32( A0, B0 ); \
|
||||
z1 = _mm512_mul_epu32( A1, B1 ); \
|
||||
A0 = _mm512_add_epi64( A0, B0 ); \
|
||||
A1 = _mm512_add_epi64( A1, B1 ); \
|
||||
z0 = _mm512_add_epi64( z0, z0 ); \
|
||||
z1 = _mm512_add_epi64( z1, z1 ); \
|
||||
A0 = _mm512_add_epi64( A0, z0 ); \
|
||||
A1 = _mm512_add_epi64( A1, z1 ); \
|
||||
D0 = _mm512_xor_si512(D0, A0); \
|
||||
D1 = _mm512_xor_si512(D1, A1); \
|
||||
D0 = _mm512_ror_epi64(D0, 32); \
|
||||
D1 = _mm512_ror_epi64(D1, 32); \
|
||||
z0 = _mm512_mul_epu32( C0, D0 ); \
|
||||
z1 = _mm512_mul_epu32( C1, D1 ); \
|
||||
C0 = _mm512_add_epi64( C0, D0 ); \
|
||||
C1 = _mm512_add_epi64( C1, D1 ); \
|
||||
z0 = _mm512_add_epi64( z0, z0 ); \
|
||||
z1 = _mm512_add_epi64( z1, z1 ); \
|
||||
C0 = _mm512_add_epi64( C0, z0 ); \
|
||||
C1 = _mm512_add_epi64( C1, z1 ); \
|
||||
B0 = _mm512_xor_si512(B0, C0); \
|
||||
B1 = _mm512_xor_si512(B1, C1); \
|
||||
B0 = _mm512_ror_epi64(B0, 24); \
|
||||
B1 = _mm512_ror_epi64(B1, 24); \
|
||||
}
|
||||
|
||||
#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
|
||||
{ \
|
||||
__m512i z0, z1; \
|
||||
z0 = _mm512_mul_epu32( A0, B0 ); \
|
||||
z1 = _mm512_mul_epu32( A1, B1 ); \
|
||||
A0 = _mm512_add_epi64( A0, B0 ); \
|
||||
A1 = _mm512_add_epi64( A1, B1 ); \
|
||||
z0 = _mm512_add_epi64( z0, z0 ); \
|
||||
z1 = _mm512_add_epi64( z1, z1 ); \
|
||||
A0 = _mm512_add_epi64( A0, z0 ); \
|
||||
A1 = _mm512_add_epi64( A1, z1 ); \
|
||||
D0 = _mm512_xor_si512(D0, A0); \
|
||||
D1 = _mm512_xor_si512(D1, A1); \
|
||||
D0 = _mm512_ror_epi64(D0, 16); \
|
||||
D1 = _mm512_ror_epi64(D1, 16); \
|
||||
z0 = _mm512_mul_epu32( C0, D0 ); \
|
||||
z1 = _mm512_mul_epu32( C1, D1 ); \
|
||||
C0 = _mm512_add_epi64( C0, D0 ); \
|
||||
C1 = _mm512_add_epi64( C1, D1 ); \
|
||||
z0 = _mm512_add_epi64( z0, z0 ); \
|
||||
z1 = _mm512_add_epi64( z1, z1 ); \
|
||||
C0 = _mm512_add_epi64( C0, z0 ); \
|
||||
C1 = _mm512_add_epi64( C1, z1 ); \
|
||||
B0 = _mm512_xor_si512(B0, C0); \
|
||||
B1 = _mm512_xor_si512(B1, C1); \
|
||||
B0 = _mm512_ror_epi64(B0, 63); \
|
||||
B1 = _mm512_ror_epi64(B1, 63); \
|
||||
}
|
||||
|
||||
/*
|
||||
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
A0 = muladd(A0, B0); \
|
||||
@@ -316,7 +341,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
|
||||
B0 = _mm512_ror_epi64(B0, 24); \
|
||||
B1 = _mm512_ror_epi64(B1, 24); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
*/
|
||||
/*
|
||||
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
A0 = muladd(A0, B0); \
|
||||
@@ -337,15 +363,14 @@ static inline __m512i muladd(__m512i x, __m512i y)
|
||||
B0 = _mm512_ror_epi64(B0, 63); \
|
||||
B1 = _mm512_ror_epi64(B1, 63); \
|
||||
} while ((void)0, 0)
|
||||
*/
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||
\
|
||||
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||
\
|
||||
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||
} while ((void)0, 0)
|
||||
@@ -354,10 +379,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
|
||||
do { \
|
||||
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
||||
\
|
||||
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||
\
|
||||
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
||||
} while ((void)0, 0)
|
||||
@@ -366,15 +389,17 @@ static inline __m512i muladd(__m512i x, __m512i y)
|
||||
do { \
|
||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
static const __m512i swap_q0 = { 0,1, 8,9, 2,3, 10,11 };
|
||||
static const __m512i swap_q1 = { 4,5, 12,13, 6,7, 14,15 };
|
||||
static const __m512i uswap_q0 = { 0,1, 4,5, 8,9, 12,13 };
|
||||
static const __m512i uswap_q1 = { 2,3, 6,7, 10,11, 14,15 };
|
||||
|
||||
#define SWAP_HALVES(A0, A1) \
|
||||
do { \
|
||||
__m512i t; \
|
||||
@@ -383,19 +408,36 @@ static inline __m512i muladd(__m512i x, __m512i y)
|
||||
A0 = t; \
|
||||
} while((void)0, 0)
|
||||
|
||||
#define SWAP_QUARTERS(A0, A1) \
|
||||
{ \
|
||||
__m512i t = _mm512_permutex2var_epi64( A0, swap_q0, A1 ); \
|
||||
A1 = _mm512_permutex2var_epi64( A0, swap_q1, A1 ); \
|
||||
A0 = t; \
|
||||
}
|
||||
|
||||
#define UNSWAP_QUARTERS(A0, A1) \
|
||||
{ \
|
||||
__m512i t = _mm512_permutex2var_epi64( A0, uswap_q0, A1 ); \
|
||||
A1 = _mm512_permutex2var_epi64( A0, uswap_q1, A1 ); \
|
||||
A0 = t; \
|
||||
}
|
||||
|
||||
/*
|
||||
#define SWAP_QUARTERS(A0, A1) \
|
||||
do { \
|
||||
SWAP_HALVES(A0, A1); \
|
||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||
} while((void)0, 0)
|
||||
|
||||
*/
|
||||
/*
|
||||
#define UNSWAP_QUARTERS(A0, A1) \
|
||||
do { \
|
||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||
SWAP_HALVES(A0, A1); \
|
||||
} while((void)0, 0)
|
||||
*/
|
||||
|
||||
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
|
||||
do { \
|
||||
|
||||
@@ -683,8 +683,9 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
mj[14] = mm256_rol_64( M[14], 15 );
|
||||
mj[15] = mm256_rol_64( M[15], 16 );
|
||||
|
||||
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
|
||||
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
|
||||
__m256i K = _mm256_set1_epi64x( 0x5555555555555550ULL );
|
||||
static const __m256i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
|
||||
0x0555555555555555ULL, 0x0555555555555555ULL };
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||
K = _mm256_add_epi64( K, Kincr );
|
||||
@@ -1094,7 +1095,7 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
__m512i dH[16] )
|
||||
{
|
||||
__m512i qt[32], xl, xh;
|
||||
__m512i mh[16];
|
||||
__m512i mh[16], mj[16];
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
@@ -1117,8 +1118,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
|
||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||
|
||||
__m512i mj[16];
|
||||
|
||||
mj[ 0] = mm512_rol_64( M[ 0], 1 );
|
||||
mj[ 1] = mm512_rol_64( M[ 1], 2 );
|
||||
mj[ 2] = mm512_rol_64( M[ 2], 3 );
|
||||
@@ -1136,8 +1135,11 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
mj[14] = mm512_rol_64( M[14], 15 );
|
||||
mj[15] = mm512_rol_64( M[15], 16 );
|
||||
|
||||
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
|
||||
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
|
||||
__m512i K = _mm512_set1_epi64( 0x5555555555555550ULL );
|
||||
static const __m512i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
|
||||
0x0555555555555555ULL, 0x0555555555555555ULL,
|
||||
0x0555555555555555ULL, 0x0555555555555555ULL,
|
||||
0x0555555555555555ULL, 0x0555555555555555ULL };
|
||||
|
||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||
K = _mm512_add_epi64( K, Kincr );
|
||||
|
||||
@@ -503,32 +503,28 @@ do { \
|
||||
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
|
||||
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
|
||||
s4 = mm512_swap64_32( s4 ); \
|
||||
s5 = mm512_swap64_32( s5 ); \
|
||||
t0 = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 ); \
|
||||
sD = mm512_swap64_32( sD ); \
|
||||
sE = mm512_swap64_32( sE ); \
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
|
||||
t1 = _mm512_mask_shuffle_epi32( sD, 0xaaaa, sE, 0xb1 ); \
|
||||
L8( s0, t0, s9, t1 ); \
|
||||
s6 = mm512_swap64_32( s6 ); \
|
||||
sF = mm512_swap64_32( sF ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
|
||||
t2 = _mm512_mask_shuffle_epi32( s6, 0x5555, s5, 0xb1 ); \
|
||||
t3 = _mm512_mask_shuffle_epi32( sF, 0x5555, sE, 0xb1 ); \
|
||||
L8( s1, t2, sA, t3 ); \
|
||||
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
|
||||
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
|
||||
\
|
||||
s7 = mm512_swap64_32( s7 ); \
|
||||
sC = mm512_swap64_32( sC ); \
|
||||
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
|
||||
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
|
||||
t4 = _mm512_mask_shuffle_epi32( s6, 0xaaaa, s7, 0xb1 ); \
|
||||
t5 = _mm512_mask_shuffle_epi32( sF, 0xaaaa, sC, 0xb1 ); \
|
||||
L8( s2, t4, sB, t5 ); \
|
||||
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
|
||||
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
|
||||
s6 = mm512_swap64_32( s6 ); \
|
||||
sF = mm512_swap64_32( sF ); \
|
||||
\
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
|
||||
t2 = _mm512_mask_shuffle_epi32( s4, 0x5555, s7, 0xb1 ); \
|
||||
t3 = _mm512_mask_shuffle_epi32( sD, 0x5555, sC, 0xb1 ); \
|
||||
L8( s3, t2, s8, t3 ); \
|
||||
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
|
||||
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
|
||||
@@ -537,21 +533,20 @@ do { \
|
||||
s7 = mm512_swap64_32( s7 ); \
|
||||
sC = mm512_swap64_32( sC ); \
|
||||
\
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
|
||||
t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
|
||||
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
|
||||
t2 = _mm512_mask_shuffle_epi32( sA, 0x5555, s2, 0xb1 ); \
|
||||
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
|
||||
t3 = mm512_swap64_32( t3 ); \
|
||||
L8( t0, t1, t2, t3 ); \
|
||||
t3 = mm512_swap64_32( t3 ); \
|
||||
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
|
||||
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
|
||||
s8 = _mm512_mask_shuffle_epi32( s8, 0x5555, t0, 0xb1 ); \
|
||||
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
|
||||
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
|
||||
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
|
||||
s2 = _mm512_mask_shuffle_epi32( s2, 0xaaaa, t2, 0xb1 ); \
|
||||
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
|
||||
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
|
||||
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
|
||||
s3 = _mm512_mask_shuffle_epi32( s3, 0xaaaa, t3, 0xb1 ); \
|
||||
sB = _mm512_mask_shuffle_epi32( sB, 0x5555, t3, 0xb1 ); \
|
||||
\
|
||||
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
|
||||
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
|
||||
@@ -1268,7 +1263,7 @@ do { \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
|
||||
// v3, 15 instructions
|
||||
#define SBOX( a, b, c, d ) \
|
||||
{ \
|
||||
__m256i tb, td; \
|
||||
@@ -1286,7 +1281,7 @@ do { \
|
||||
#endif
|
||||
|
||||
/*
|
||||
/ v2, 16 instructions, 10 TL equivalent instructions
|
||||
/ v2, 16 instructions
|
||||
#define SBOX( a, b, c, d ) \
|
||||
{ \
|
||||
__m256i t = mm256_xorand( d, a, c ); \
|
||||
|
||||
@@ -80,14 +80,14 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
__m512i t = a0; \
|
||||
a0 = mm512_xoror( a3, a0, a1 ); \
|
||||
a2 = _mm512_xor_si512( a2, a3 ); \
|
||||
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
||||
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
|
||||
a3 = mm512_xorand( a2, a3, t ); \
|
||||
a2 = mm512_xorand( a1, a2, a0); \
|
||||
a1 = _mm512_or_si512( a1, a3 ); \
|
||||
a3 = _mm512_xor_si512( a3, a2 ); \
|
||||
t = _mm512_xor_si512( t, a1 ); \
|
||||
a2 = _mm512_and_si512( a2, a1 ); \
|
||||
a1 = mm512_xnor( a1, a0 ); \
|
||||
a1 = mm512_nxor( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
@@ -527,14 +527,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
__m256i t = a0; \
|
||||
a0 = mm256_xoror( a3, a0, a1 ); \
|
||||
a2 = _mm256_xor_si256( a2, a3 ); \
|
||||
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
||||
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
|
||||
a3 = mm256_xorand( a2, a3, t ); \
|
||||
a2 = mm256_xorand( a1, a2, a0); \
|
||||
a1 = _mm256_or_si256( a1, a3 ); \
|
||||
a3 = _mm256_xor_si256( a3, a2 ); \
|
||||
t = _mm256_xor_si256( t, a1 ); \
|
||||
a2 = _mm256_and_si256( a2, a1 ); \
|
||||
a1 = mm256_xnor( a1, a0 ); \
|
||||
a1 = mm256_nxor( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
|
||||
@@ -69,18 +69,18 @@
|
||||
v128_t t = a0; \
|
||||
a0 = v128_xoror( a3, a0, a1 ); \
|
||||
a2 = v128_xor( a2, a3 ); \
|
||||
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
||||
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* ~a1 ^ (a3 & t) */ \
|
||||
a3 = v128_xorand( a2, a3, t ); \
|
||||
a2 = v128_xorand( a1, a2, a0 ); \
|
||||
a1 = v128_or( a1, a3 ); \
|
||||
a3 = v128_xor( a3, a2 ); \
|
||||
t = v128_xor( t, a1 ); \
|
||||
a2 = v128_and( a2, a1 ); \
|
||||
a1 = v128_xnor( a1, a0 ); \
|
||||
a1 = v128_nxor( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
#else
|
||||
#elif defined(__ARM_NEON) || defined(__SSE2__)
|
||||
|
||||
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
||||
{ \
|
||||
|
||||
@@ -67,7 +67,7 @@ static const uint64_t K512[80] =
|
||||
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
|
||||
};
|
||||
|
||||
#if defined(__AVX2__) && defined(__SHA512__)
|
||||
#if defined(__AVX__) && defined(__SHA512__)
|
||||
|
||||
// SHA-512 implemented using SHA512 CPU extension.
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include "simd-utils.h"
|
||||
#include "sph_sha2.h"
|
||||
|
||||
#if defined(__SHA512__) && defined(__AVX2__)
|
||||
#if defined(__SHA512__) && defined(__AVX__)
|
||||
|
||||
// Experimental, untested
|
||||
// Need to substitute for sph_sha512
|
||||
|
||||
@@ -305,7 +305,7 @@ do { \
|
||||
xb0 = mm512_rol_32( xb0, 1 ); \
|
||||
xa0 = mm512_xor3( xm, xb1, \
|
||||
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
|
||||
xb0 = mm512_xnor( xa0, xb0 ); \
|
||||
xb0 = mm512_nxor( xa0, xb0 ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_0_16 do { \
|
||||
@@ -898,7 +898,7 @@ do { \
|
||||
xb0 = mm256_rol_32( xb0, 1 ); \
|
||||
xa0 = mm256_xor3( xm, xb1, \
|
||||
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
|
||||
xb0 = mm256_xnor( xa0, xb0 ); \
|
||||
xb0 = mm256_nxor( xa0, xb0 ); \
|
||||
} while (0)
|
||||
|
||||
#define PERM_STEP_0_8 do { \
|
||||
|
||||
@@ -171,6 +171,53 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
|
||||
{{ -30, 55, -58, -65, -95, -40, -98, 94 }},
|
||||
};
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
static const __m256i V256_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff,
|
||||
0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
|
||||
#define V128_00FF _mm256_castsi256_si128( V256_00FF )
|
||||
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON )
|
||||
|
||||
static const v128u64_t V128_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SIMD512)
|
||||
|
||||
static const __m512i V512_0101 = { 0x0101010101010101, 0x0101010101010101,
|
||||
0x0101010101010101, 0x0101010101010101,
|
||||
0x0101010101010101, 0x0101010101010101,
|
||||
0x0101010101010101, 0x0101010101010101 };
|
||||
#define V256_0101 _mm512_castsi512_si256( V512_0101 )
|
||||
#define V128_0101 _mm512_castsi512_si128( V512_0101 )
|
||||
|
||||
|
||||
static const __m512i V512_0080 = { 0x0080008000800080, 0x0080008000800080,
|
||||
0x0080008000800080, 0x0080008000800080,
|
||||
0x0080008000800080, 0x0080008000800080,
|
||||
0x0080008000800080, 0x0080008000800080 };
|
||||
#define V256_0080 _mm512_castsi512_si256( V512_0080 )
|
||||
#define V128_0080 _mm512_castsi512_si128( V512_0080 )
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
static const __m256i V256_0101 = { 0x0101010101010101, 0x0101010101010101,
|
||||
0x0101010101010101, 0x0101010101010101 };
|
||||
#define V128_0101 _mm256_castsi256_si128( V256_0101 )
|
||||
|
||||
static const __m256i V256_0080 = { 0x0080008000800080, 0x0080008000800080,
|
||||
0x0080008000800080, 0x0080008000800080 };
|
||||
#define V128_0080 _mm256_castsi256_si128( V256_0080 )
|
||||
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON )
|
||||
|
||||
static const v128u64_t V128_0101 = { 0x0101010101010101, 0x0101010101010101 };
|
||||
|
||||
static const v128u64_t V128_0080 = { 0x0080008000800080, 0x0080008000800080 };
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
#define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1)
|
||||
@@ -190,13 +237,10 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
|
||||
#define shufxor(x,s) XCAT(SHUFXOR_,s)(x)
|
||||
|
||||
#define REDUCE(x) \
|
||||
v128_sub16( v128_and( x, v128_64( \
|
||||
0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) )
|
||||
v128_sub16( v128_and( x, V128_00FF ), v128_sra16( x, 8 ) )
|
||||
|
||||
#define EXTRA_REDUCE_S(x)\
|
||||
v128_sub16( x, v128_and( \
|
||||
v128_64( 0x0101010101010101 ), \
|
||||
v128_cmpgt16( x, v128_64( 0x0080008000800080 ) ) ) )
|
||||
v128_sub16( x, v128_and( V128_0101, v128_cmpgt16( x, V128_0080 ) ) )
|
||||
|
||||
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
|
||||
|
||||
@@ -293,10 +337,9 @@ do { \
|
||||
// This will make the full FFT_64 in order.
|
||||
#define INTERLEAVE(i,j) \
|
||||
do { \
|
||||
v128u16_t t1= X(i); \
|
||||
v128u16_t t2= X(j); \
|
||||
X(i) = v128_unpacklo16( t1, t2 ); \
|
||||
X(j) = v128_unpackhi16( t1, t2 ); \
|
||||
v128u16_t t = X(i); \
|
||||
X(i) = v128_unpacklo16( t, X(j) ); \
|
||||
X(j) = v128_unpackhi16( t, X(j) ); \
|
||||
} while(0)
|
||||
|
||||
INTERLEAVE( 1, 0 );
|
||||
@@ -803,23 +846,12 @@ static const m256_v16 FFT256_Twiddle[] =
|
||||
|
||||
#define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)
|
||||
|
||||
#if defined(VL256)
|
||||
|
||||
#define REDUCE(x) \
|
||||
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \
|
||||
_mm256_srai_epi16( x, 8 ) )
|
||||
#else
|
||||
|
||||
#define REDUCE(x) \
|
||||
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
|
||||
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
|
||||
|
||||
#endif
|
||||
_mm256_sub_epi16( _mm256_and_si256( x, V256_00FF ), _mm256_srai_epi16( x, 8 ) )
|
||||
|
||||
#define EXTRA_REDUCE_S(x)\
|
||||
_mm256_sub_epi16( x, _mm256_and_si256( \
|
||||
_mm256_set1_epi64x( 0x0101010101010101 ), \
|
||||
_mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )
|
||||
_mm256_sub_epi16( x, _mm256_and_si256( V256_0101, \
|
||||
_mm256_cmpgt_epi16( x, V256_0080 ) ) )
|
||||
|
||||
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
|
||||
|
||||
@@ -917,10 +949,9 @@ do { \
|
||||
// This will make the full FFT_64 in order.
|
||||
#define INTERLEAVE(i,j) \
|
||||
do { \
|
||||
__m256i t1= X(i); \
|
||||
__m256i t2= X(j); \
|
||||
X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
|
||||
X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
|
||||
__m256i t = X(i); \
|
||||
X(i) = _mm256_unpacklo_epi16( t, X(j) ); \
|
||||
X(j) = _mm256_unpackhi_epi16( t, X(j) ); \
|
||||
} while(0)
|
||||
|
||||
INTERLEAVE( 1, 0 );
|
||||
@@ -1658,10 +1689,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
|
||||
_mm512_srai_epi16( x, 8 ) )
|
||||
|
||||
#define EXTRA_REDUCE_S4w(x) \
|
||||
_mm512_sub_epi16( x, _mm512_and_si512( \
|
||||
_mm512_set1_epi64( 0x0101010101010101 ), \
|
||||
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
|
||||
x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )
|
||||
_mm512_sub_epi16( x, _mm512_and_si512( V512_0101, \
|
||||
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( x, V512_0080 ) ) ) )
|
||||
|
||||
// generic, except it calls targetted macros
|
||||
#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) )
|
||||
|
||||
@@ -640,24 +640,25 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
|
||||
#if defined(__AVX2__)
|
||||
|
||||
__m256i F0, F1, F2, F3, F4, F5, F6, F7;
|
||||
__m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] );
|
||||
__m256i *table = (__m256i*)fftTable;
|
||||
__m256i tbl = table[ input[0] ];
|
||||
__m256i *mul = (__m256i*)multipliers;
|
||||
__m256i *out = (__m256i*)output;
|
||||
|
||||
F0 = _mm256_mullo_epi32( mul[0], tbl );
|
||||
tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] );
|
||||
tbl = table[ input[1] ];
|
||||
F1 = _mm256_mullo_epi32( mul[1], tbl );
|
||||
tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] );
|
||||
tbl = table[ input[2] ];
|
||||
F2 = _mm256_mullo_epi32( mul[2], tbl );
|
||||
tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] );
|
||||
tbl = table[ input[3] ];
|
||||
F3 = _mm256_mullo_epi32( mul[3], tbl );
|
||||
tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] );
|
||||
tbl = table[ input[4] ];
|
||||
F4 = _mm256_mullo_epi32( mul[4], tbl );
|
||||
tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] );
|
||||
tbl = table[ input[5] ];
|
||||
F5 = _mm256_mullo_epi32( mul[5], tbl );
|
||||
tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] );
|
||||
tbl = table[ input[6] ];
|
||||
F6 = _mm256_mullo_epi32( mul[6], tbl );
|
||||
tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] );
|
||||
tbl = table[ input[7] ];
|
||||
F7 = _mm256_mullo_epi32( mul[7], tbl );
|
||||
|
||||
#define ADD_SUB( a, b ) \
|
||||
@@ -677,9 +678,9 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
|
||||
ADD_SUB( F1, F3 );
|
||||
ADD_SUB( F4, F6 );
|
||||
ADD_SUB( F5, F7 );
|
||||
F5 = _mm256_slli_epi32( F5, 2 );
|
||||
F6 = _mm256_slli_epi32( F6, 4 );
|
||||
F7 = _mm256_slli_epi32( F7, 6 );
|
||||
F5 = _mm256_slli_epi32( F5, 2 );
|
||||
ADD_SUB( F0, F4 );
|
||||
ADD_SUB( F1, F5 );
|
||||
ADD_SUB( F2, F6 );
|
||||
|
||||
@@ -4,11 +4,11 @@
|
||||
# during development. However, the information contained may provide compilation
|
||||
# tips to users.
|
||||
|
||||
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
|
||||
rm cpuminer cpuminer-m2 cpuminer-m4 cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto > /dev/null
|
||||
|
||||
# armv9 needs gcc-13
|
||||
# -march-armv9-a includes SVE2 but no crypto
|
||||
# -march=armv9-a+crypto adds AES & SHA2 but not SHA512
|
||||
# -march=armv9-a+crypto adds AES & SHA256 but not SHA512
|
||||
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
@@ -27,18 +27,37 @@ CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-c
|
||||
make -j $(nproc)
|
||||
mv cpuminer cpuminer-armv9
|
||||
|
||||
# Apple M4: armv9.2, AES, SHA3, SVE2
|
||||
make clean || echo clean
|
||||
CFLAGS="-O3 -march=armv9.2-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
mv cpuminer cpuminer-m4
|
||||
|
||||
# Apple M2: armv8.6, AES, SHA3
|
||||
make clean || echo clean
|
||||
CFLAGS="-O3 -march=armv8.6-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
mv cpuminer cpuminer-m2
|
||||
|
||||
# SVE2 available in armv8.5
|
||||
make clean || echo clean
|
||||
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
|
||||
|
||||
# SHA3 available in armv8.4
|
||||
# Apple M1: armv8.4 AES, SHA3
|
||||
make clean || echo clean
|
||||
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
mv cpuminer cpuminer-armv8.4-crypto-sha3
|
||||
|
||||
# Cortex-A76 (Orange Pi 5)
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=armv8.2-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
mv cpuminer cpuminer-armv8-crypto
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
|
||||
|
||||
@@ -34,9 +34,7 @@ make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-arrowlake-s
|
||||
|
||||
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
|
||||
# Granitrapids does not build with AVX10, SHA512 or APX.
|
||||
# wait for Diamondrapids & gcc-15.
|
||||
# Intel Core Graniterapids: AVX512, SHA256, VAES, AMX, needs gcc-14
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
|
||||
@@ -44,13 +42,13 @@ make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-graniterapids
|
||||
|
||||
# SHA512 AVX10.1
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
|
||||
#make -j $(nproc)
|
||||
#strip -s cpuminer
|
||||
#mv cpuminer cpuminer-avx10_1
|
||||
# Graniterapids + SHA512, AVX10.1
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx10.1
|
||||
|
||||
# SHA512 AVX10.2
|
||||
#make clean || echo clean
|
||||
@@ -72,6 +70,9 @@ mv cpuminer cpuminer-graniterapids
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
|
||||
# zen4 is close enough for older compiler
|
||||
#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
|
||||
|
||||
make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-zen5
|
||||
@@ -138,13 +139,21 @@ make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx
|
||||
|
||||
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
|
||||
# SSE4.2 AES SHA: Intel Atom Goldmont, newer Pentium & Celeron
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=goldmont -Wall" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-sse42-aes-sha
|
||||
|
||||
# SSE4.2 AES: Intel Westmere, older Pentium & Celeron
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl
|
||||
make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-aes-sse42
|
||||
mv cpuminer cpuminer-sse42-aes
|
||||
|
||||
# SSE4.2: Intel Nehalem
|
||||
make clean || echo clean
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
#
|
||||
# make clean and rm all the targetted executables.
|
||||
|
||||
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen* cpuminer-x64 cpuminer-armv* > /dev/null
|
||||
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512* cpuminer-alderlake cpuminer-avx10* cpuminer-avx2* cpuminer-avx cpuminer-sse* cpuminer-ssse3 cpuminer-zen* cpuminer-x64 cpuminer-armv* cpuminer-m2 cpuminer-m4 > /dev/null
|
||||
|
||||
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null
|
||||
rm cpuminer-avx512* cpuminer-avx2* cpuminer-avx.exe cpuminer-sse* cpuminer-zen* cpuminer-x64.exe > /dev/null
|
||||
|
||||
make distclean > /dev/null
|
||||
|
||||
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.6.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.7.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
|
||||
@@ -608,8 +608,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='25.6'
|
||||
PACKAGE_STRING='cpuminer-opt 25.6'
|
||||
PACKAGE_VERSION='25.7'
|
||||
PACKAGE_STRING='cpuminer-opt 25.7'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1431,7 +1431,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 25.6:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 25.7:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1536,7 +1536,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 25.6
|
||||
cpuminer-opt configure 25.7
|
||||
generated by GNU Autoconf 2.71
|
||||
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
@@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 25.6, which was
|
||||
It was created by cpuminer-opt $as_me 25.7, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
$ $0$ac_configure_args_raw
|
||||
@@ -3591,7 +3591,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='25.6'
|
||||
VERSION='25.7'
|
||||
|
||||
|
||||
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
|
||||
@@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 25.6, which was
|
||||
This file was extended by cpuminer-opt $as_me 25.7, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config='$ac_cs_config_escaped'
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 25.6
|
||||
cpuminer-opt config.status 25.7
|
||||
configured by $0, generated by GNU Autoconf 2.71,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [25.6])
|
||||
AC_INIT([cpuminer-opt], [25.7])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
||||
20
configure~
20
configure~
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.6.
|
||||
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.7.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
|
||||
@@ -601,8 +601,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='25.6'
|
||||
PACKAGE_STRING='cpuminer-opt 25.6'
|
||||
PACKAGE_VERSION='25.7'
|
||||
PACKAGE_STRING='cpuminer-opt 25.7'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
'configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems.
|
||||
'configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1424,7 +1424,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 25.6:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 25.7:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1528,7 +1528,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 25.6
|
||||
cpuminer-opt configure 25.7
|
||||
generated by GNU Autoconf 2.72
|
||||
|
||||
Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
@@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 25.6, which was
|
||||
It was created by cpuminer-opt $as_me 25.7, which was
|
||||
generated by GNU Autoconf 2.72. Invocation command line was
|
||||
|
||||
$ $0$ac_configure_args_raw
|
||||
@@ -3768,7 +3768,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='25.6'
|
||||
VERSION='25.7'
|
||||
|
||||
|
||||
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
|
||||
@@ -7581,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 25.6, which was
|
||||
This file was extended by cpuminer-opt $as_me 25.7, which was
|
||||
generated by GNU Autoconf 2.72. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -7649,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config='$ac_cs_config_escaped'
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 25.6
|
||||
cpuminer-opt config.status 25.7
|
||||
configured by $0, generated by GNU Autoconf 2.72,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
||||
167
cpu-miner.c
167
cpu-miner.c
@@ -921,40 +921,33 @@ out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
// returns the unit prefix and the hashrate appropriately scaled.
|
||||
void scale_hash_for_display ( double* hashrate, char* prefix )
|
||||
{
|
||||
if ( *hashrate < 1e4 ) *prefix = 0;
|
||||
else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; }
|
||||
else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; }
|
||||
else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; }
|
||||
else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; }
|
||||
else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; }
|
||||
else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; }
|
||||
else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; }
|
||||
else { *prefix = 'Y'; *hashrate /= 1e24; }
|
||||
}
|
||||
|
||||
// Does not account for leap years.
|
||||
static inline void sprintf_et( char *str, long unsigned int seconds )
|
||||
{
|
||||
long unsigned int min = seconds / 60;
|
||||
long unsigned int sec = seconds % 60;
|
||||
long unsigned int hrs = min / 60;
|
||||
|
||||
if ( unlikely( hrs ) )
|
||||
long unsigned int minutes = seconds / 60;
|
||||
if ( minutes )
|
||||
{
|
||||
long unsigned int days = hrs / 24;
|
||||
long unsigned int years = days / 365;
|
||||
if ( years ) // 0y000d
|
||||
sprintf( str, "%luy%lud", years, years % 365 );
|
||||
else if ( days ) // 0d00h
|
||||
sprintf( str, "%lud%02luh", days, hrs % 24 );
|
||||
else // 0h00m
|
||||
sprintf( str, "%luh%02lum", hrs, min % 60 );
|
||||
long unsigned int hours = minutes / 60;
|
||||
if ( hours )
|
||||
{
|
||||
long unsigned int days = hours / 24;
|
||||
if ( days )
|
||||
{
|
||||
long unsigned int years = days / 365;
|
||||
if ( years )
|
||||
sprintf( str, "%luy%03lud", years, days % 365 ); // 0y000d
|
||||
else
|
||||
sprintf( str, "%lud%02luh", days, hours % 24 ); // 0d00h
|
||||
}
|
||||
else
|
||||
sprintf( str, "%luh%02lum", hours, minutes % 60 ); // 0h00m
|
||||
}
|
||||
else
|
||||
sprintf( str, "%lum%02lus", minutes, seconds % 60 ); // 0m00s
|
||||
}
|
||||
else // 0m00s
|
||||
sprintf( str, "%lum%02lus", min, sec );
|
||||
}
|
||||
else
|
||||
sprintf( str, "%lus", seconds ); // 0s
|
||||
}
|
||||
|
||||
const long double exp32 = EXP32; // 2**32
|
||||
const long double exp48 = EXP32 * EXP16; // 2**48
|
||||
@@ -2833,67 +2826,29 @@ static void show_credits()
|
||||
static bool cpu_capability( bool display_only )
|
||||
{
|
||||
char cpu_brand[0x40];
|
||||
bool cpu_has_sse2 = has_sse2(); // X86_64 only
|
||||
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
|
||||
bool cpu_has_sse41 = has_sse41(); // X86_64 only
|
||||
bool cpu_has_sse42 = has_sse42();
|
||||
bool cpu_has_avx = has_avx();
|
||||
bool cpu_has_neon = has_neon(); // AArch64
|
||||
bool cpu_has_sve = has_sve(); // aarch64 only, insignificant
|
||||
bool cpu_has_sve2 = has_sve2(); // AArch64 only
|
||||
bool cpu_has_sme = has_sme();
|
||||
bool cpu_has_sme2 = has_sme2();
|
||||
bool cpu_has_avx2 = has_avx2();
|
||||
bool cpu_has_avx512 = has_avx512();
|
||||
bool cpu_has_avx10 = has_avx10();
|
||||
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
|
||||
bool cpu_has_vaes = has_vaes(); // X86_64 only
|
||||
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
|
||||
bool cpu_has_sha512 = has_sha512();
|
||||
bool sw_has_x86_64 = false;
|
||||
bool sw_has_aarch64 = false;
|
||||
int sw_arm_arch = 0; // AArch64 version
|
||||
bool sw_has_neon = false; // AArch64
|
||||
bool sw_has_sve = false; // AArch64
|
||||
bool sw_has_sve2 = false; // AArch64
|
||||
bool sw_has_sve = false;
|
||||
bool sw_has_sve2 = false;
|
||||
bool sw_has_sme = false;
|
||||
bool sw_has_sme2 = false;
|
||||
bool sw_has_sse2 = false; // x86_64
|
||||
bool sw_has_ssse3 = false; // x86_64
|
||||
bool sw_has_sse41 = false; // x86_64
|
||||
bool sw_has_ssse3 = false;
|
||||
bool sw_has_sse41 = false;
|
||||
bool sw_has_sse42 = false;
|
||||
bool sw_has_avx = false;
|
||||
bool sw_has_avx2 = false;
|
||||
bool sw_has_avx512 = false;
|
||||
bool sw_has_avx10 = false;
|
||||
bool sw_has_aes = false;
|
||||
bool sw_has_vaes = false;
|
||||
bool sw_has_amx = false;
|
||||
bool sw_has_apx = false;
|
||||
bool sw_has_aes = false; // x86_64 or AArch64
|
||||
bool sw_has_vaes = false; // x86_64
|
||||
bool sw_has_sha256 = false; // x86_64 or AArch64
|
||||
bool sw_has_sha512 = false; // x86_64 or AArch64
|
||||
/*
|
||||
set_t algo_features = algo_gate.optimizations;
|
||||
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
|
||||
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
|
||||
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
|
||||
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
|
||||
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
|
||||
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
|
||||
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
|
||||
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
|
||||
bool use_sse2;
|
||||
bool use_sse42;
|
||||
bool use_avx;
|
||||
bool use_avx2;
|
||||
bool use_avx512;
|
||||
bool use_aes;
|
||||
bool use_vaes;
|
||||
bool use_sha256;
|
||||
bool use_sha512;
|
||||
bool use_neon;
|
||||
bool use_none;
|
||||
*/
|
||||
bool sw_has_sha512 = false;
|
||||
|
||||
#if defined(__x86_64__)
|
||||
sw_has_x86_64 = true;
|
||||
#elif defined(__aarch64__)
|
||||
@@ -2928,14 +2883,15 @@ static bool cpu_capability( bool display_only )
|
||||
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
|
||||
sw_has_avx512 = true;
|
||||
#endif
|
||||
// AVX10 version is not significant as of AVX10.2. If that changes use a better
|
||||
// way to test the version than sequentially.
|
||||
// #if defined(__AVX10_2__)
|
||||
//
|
||||
// #elif defined(__AVX10_1__)
|
||||
#if defined(__AVX10_1__)
|
||||
#if defined(__AVX10_1__) // version is not significant
|
||||
sw_has_avx10 = true;
|
||||
#endif
|
||||
#ifdef __AMX_TILE__
|
||||
sw_has_amx = true;
|
||||
#endif
|
||||
#ifdef __APX_F__
|
||||
sw_has_apx = true;
|
||||
#endif
|
||||
|
||||
// x86_64 or AArch64
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
@@ -2955,6 +2911,7 @@ static bool cpu_capability( bool display_only )
|
||||
#if defined(__ARM_NEON)
|
||||
sw_has_neon = true;
|
||||
#endif
|
||||
// FYI, SVE & SME not used by cpuminer
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
sw_has_sve = true;
|
||||
#endif
|
||||
@@ -2975,8 +2932,7 @@ static bool cpu_capability( bool display_only )
|
||||
// Build
|
||||
printf( "SW built on " __DATE__
|
||||
#if defined(__clang__)
|
||||
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
|
||||
__clang_patchlevel__ );
|
||||
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__ );
|
||||
#elif defined(__GNUC__)
|
||||
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
|
||||
#endif
|
||||
@@ -3002,27 +2958,30 @@ static bool cpu_capability( bool display_only )
|
||||
printf("CPU features: ");
|
||||
if ( cpu_arch_x86_64() )
|
||||
{
|
||||
if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() );
|
||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||
else if ( cpu_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( cpu_has_avx ) printf( " AVX " );
|
||||
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
|
||||
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
|
||||
else if ( cpu_has_sse2 ) printf( " SSE2 " );
|
||||
if ( has_avx10() ) printf( " AVX10.%d", avx10_version() );
|
||||
else if ( has_avx512() ) printf( " AVX512" );
|
||||
else if ( has_avx2() ) printf( " AVX2 " );
|
||||
else if ( has_avx() ) printf( " AVX " );
|
||||
else if ( has_sse42() ) printf( " SSE4.2" );
|
||||
else if ( has_sse41() ) printf( " SSE4.1" );
|
||||
else if ( has_ssse3() ) printf( " SSSE3 " );
|
||||
else if ( has_sse2() ) printf( " SSE2 " );
|
||||
if ( has_amx() ) printf( " AMX" );
|
||||
if ( has_apx_f() ) printf( " APX" );
|
||||
|
||||
}
|
||||
else if ( cpu_arch_aarch64() )
|
||||
{
|
||||
if ( cpu_has_neon ) printf( " NEON" );
|
||||
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
|
||||
else if ( cpu_has_sve ) printf( " SVE" );
|
||||
if ( cpu_has_sme2 ) printf( " SME2" );
|
||||
else if ( cpu_has_sme ) printf( " SME" );
|
||||
if ( has_neon() ) printf( " NEON" );
|
||||
if ( has_sve2() ) printf( " SVE2-%d", sve_vector_length() );
|
||||
else if ( has_sve() ) printf( " SVE" );
|
||||
if ( has_sme2() ) printf( " SME2" );
|
||||
else if ( has_sme() ) printf( " SME" );
|
||||
}
|
||||
if ( cpu_has_vaes ) printf( " VAES" );
|
||||
else if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sha512 ) printf( " SHA512" );
|
||||
else if ( cpu_has_sha256 ) printf( " SHA256" );
|
||||
if ( has_vaes() ) printf( " VAES" );
|
||||
else if ( has_aes() ) printf( " AES" );
|
||||
if ( has_sha512() ) printf( " SHA512" );
|
||||
else if ( has_sha256() ) printf( " SHA256" );
|
||||
|
||||
printf("\nSW features: ");
|
||||
if ( sw_has_x86_64 )
|
||||
@@ -3035,6 +2994,8 @@ static bool cpu_capability( bool display_only )
|
||||
else if ( sw_has_sse41 ) printf( " SSE4.1" );
|
||||
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
|
||||
else if ( sw_has_sse2 ) printf( " SSE2 " );
|
||||
if ( sw_has_amx ) printf( " AMX" );
|
||||
if ( sw_has_apx ) printf( " APX" );
|
||||
}
|
||||
else if ( sw_has_aarch64 )
|
||||
{
|
||||
|
||||
2
miner.h
2
miner.h
@@ -542,7 +542,9 @@ void applog_hash(void *hash);
|
||||
void format_hashrate(double hashrate, char *output);
|
||||
void print_hash_tests(void);
|
||||
|
||||
// Factors of 1000 used for hashes, ie kH/s, Mh/s.
|
||||
void scale_hash_for_display ( double* hashrate, char* units );
|
||||
// Factors of 1024 used for bytes, ie kiB, MiB.
|
||||
void format_number_si( double* hashrate, char* si_units );
|
||||
void report_summary_log( bool force );
|
||||
|
||||
|
||||
@@ -447,7 +447,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
|
||||
|
||||
// ~( a ^ b ), same as (~a) ^ b
|
||||
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
#define v128_nxor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
|
||||
#else
|
||||
|
||||
@@ -469,7 +469,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
|
||||
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
|
||||
|
||||
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) )
|
||||
#define v128_nxor( a, b ) v128_not( _mm_xor_si128( a, b ) )
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -170,7 +170,7 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
|
||||
|
||||
// ~( a ^ b ), same as (~a) ^ b
|
||||
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
#define mm256_nxor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
|
||||
#else
|
||||
|
||||
@@ -208,7 +208,7 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
#define mm256_orand( a, b, c ) \
|
||||
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
|
||||
|
||||
#define mm256_xnor( a, b ) \
|
||||
#define mm256_nxor( a, b ) \
|
||||
mm256_not( _mm256_xor_si256( a, b ) )
|
||||
|
||||
#endif
|
||||
|
||||
@@ -18,8 +18,13 @@
|
||||
|
||||
// AVX512 intrinsics have a few changes from previous conventions.
|
||||
//
|
||||
// "_mm512_cmp" instructions now returns a bitmask instead of a vector mask.
|
||||
// This removes the need for an explicit movemask instruction.
|
||||
// "_mm512_cmp" instructions now return a bitmask instead of a vector mask.
|
||||
// This removes the need for an explicit movemask instruction. It is also
|
||||
// slower than AVX2 cmp. There is no version of AVX512 cmp that returns a
|
||||
// vector result resulting in a double penalty if a vector result is needed:
|
||||
// slower cmp instruction & extra instruction to convert bit mask into
|
||||
// vector mask. 256 bit & 128 bit still have legacy cmp returning vector
|
||||
// while AVX512VL adds masked versions returning bit mask.
|
||||
//
|
||||
// Many previously sizeless (si) instructions now have sized (epi) versions
|
||||
// to accomodate masking packed elements.
|
||||
@@ -30,7 +35,7 @@
|
||||
// list.
|
||||
//
|
||||
// "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other
|
||||
// AVX512 permutes can cross all lanes.
|
||||
// AVX512 instructions using the permute name can cross all lanes.
|
||||
//
|
||||
// New alignr instructions for epi64 and epi32 operate across the entire
|
||||
// vector but slower than epi8 which continues to be restricted to 128 bit
|
||||
@@ -50,16 +55,17 @@
|
||||
// parentheses to ensure the expression argument is evaluated first.
|
||||
// - if an argument is to referenced multiple times a C inline function
|
||||
// should be used instead of a macro to prevent an expression argument
|
||||
// from being evaluated multiple times (wasteful) or produces side
|
||||
// from being evaluated multiple times (wasteful) or produce side
|
||||
// effects (very bad).
|
||||
//
|
||||
// There are 2 areas where overhead is a major concern: constants and
|
||||
// permutations.
|
||||
//
|
||||
// Constants need to be composed at run time by assembling individual
|
||||
// elements, very expensive. The cost is proportional to the number of
|
||||
// different elements therefore use the largest element size possible,
|
||||
// merge smaller integer elements to 64 bits, and group repeated elements.
|
||||
// elements or loaded from memory, very expensive. The cost of runtime
|
||||
// construction is proportional to the number of different elements
|
||||
// therefore use the largest element size possible merging smaller integer
|
||||
// elements to 64 bits, and group repeated elements.
|
||||
//
|
||||
// Constants with repeating patterns can be optimized with the smaller
|
||||
// patterns repeated more frequently being more efficient.
|
||||
@@ -67,14 +73,15 @@
|
||||
// Some specific constants can be very efficient. Zero is very efficient,
|
||||
// 1 and -1 slightly less so.
|
||||
//
|
||||
// If an expensive constant is to be reused in the same function it should
|
||||
// be declared as a local variable defined once and reused.
|
||||
// If an expensive constant is to be reused in the same function it may
|
||||
// be declared as a local variable defined once and reused. If frequently
|
||||
// used it can be declared as a static const in memory.
|
||||
//
|
||||
// Permutations can be very expensive if they use a vector control index,
|
||||
// even if the permutation itself is quite efficient.
|
||||
// The index is essentially a constant with all the baggage that brings.
|
||||
// The same rules apply, if an index is to be reused it should be defined
|
||||
// as a local. This applies specifically to bswap operations.
|
||||
// even if the permute instruction itself is quite efficient.
|
||||
// The index is essentially a vector constant with all the baggage that
|
||||
// brings. The same rules apply, if an index is to be reused it should either
|
||||
// be defined as a local or static const.
|
||||
//
|
||||
// Permutations that cross 128 bit lanes are typically slower and often need
|
||||
// a vector control index. If the permutation doesn't need to cross 128 bit
|
||||
@@ -221,7 +228,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
|
||||
|
||||
// ~( a ^ b ), (~a) ^ b
|
||||
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
#define mm512_nxor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
|
||||
|
||||
// ~( a & b )
|
||||
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef )
|
||||
@@ -241,6 +248,15 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_ror_32 _mm512_ror_epi32
|
||||
#define mm512_rol_32 _mm512_rol_epi32
|
||||
|
||||
/* not used
|
||||
#if defined(__AVX512VBMI2__)
|
||||
|
||||
#define mm512_ror_16( v, c ) _mm512_shrdi_epi16( c, v, v )
|
||||
#define mm512_rol_16( v, c ) _mm512_shldi_epi16( c, v, v )
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
//
|
||||
// Reverse byte order of packed elements, vectorized endian conversion.
|
||||
|
||||
@@ -249,9 +265,17 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 )
|
||||
|
||||
/* not used
|
||||
#if defined(__AVX512VBMI2__)
|
||||
|
||||
#define mm512_bswap_16( v ) mm512_ror_16( v, 8 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm512_bswap_16( v ) \
|
||||
_mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
|
||||
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
|
||||
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
#define mm512_bswap_16( v ) \
|
||||
@@ -431,8 +455,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
|
||||
_mm512_castsi512_ps( v2 ), c ) );
|
||||
|
||||
// 64 bit lanes
|
||||
// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE.
|
||||
|
||||
// Redundant with ror & rol but included for consistency with AVX2/SSE.
|
||||
#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 )
|
||||
#define mm512_swap64_32 mm512_qrev32 // grandfathered
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@
|
||||
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
|
||||
|
||||
// ~( v1 ^ v0 ), same as (~v1) ^ v0
|
||||
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
|
||||
#define v128_nxor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
|
||||
|
||||
// ~v1 | v0, args reversed for consistency with x86_64
|
||||
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
|
||||
@@ -349,52 +349,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
|
||||
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
|
||||
|
||||
// Bit rotation
|
||||
/*
|
||||
#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 )
|
||||
#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 )
|
||||
#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 )
|
||||
#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 )
|
||||
#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 )
|
||||
#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 )
|
||||
#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 )
|
||||
#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 )
|
||||
|
||||
#define v128_ror64( v, c ) \
|
||||
( (c) == 8 ) ? v128_shuflr64_8( v ) \
|
||||
: ( (c) == 16 ) ? v128_shuflr64_16( v ) \
|
||||
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
|
||||
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
|
||||
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
|
||||
: ( (c) == 48 ) ? v128_shufll64_16( v ) \
|
||||
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
|
||||
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
|
||||
((uint64x2_t)(v)), c )
|
||||
|
||||
#define v128_rol64( v, c ) \
|
||||
( (c) == 8 ) ? v128_shufll64_8( v ) \
|
||||
: ( (c) == 16 ) ? v128_shufll64_16( v ) \
|
||||
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
|
||||
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
|
||||
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
|
||||
: ( (c) == 48 ) ? v128_shuflr64_16( v ) \
|
||||
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
|
||||
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
|
||||
((uint64x2_t)(v)), c )
|
||||
|
||||
#define v128_ror32( v, c ) \
|
||||
( (c) == 8 ) ? v128_shuflr32_8( v ) \
|
||||
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
|
||||
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
|
||||
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
|
||||
((uint32x4_t)(v)), c )
|
||||
|
||||
#define v128_rol32( v, c ) \
|
||||
( (c) == 8 ) ? v128_shufll32_8( v ) \
|
||||
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
|
||||
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
|
||||
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
|
||||
((uint32x4_t)(v)), c )
|
||||
*/
|
||||
|
||||
#define v128_ror64( v, c ) \
|
||||
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
|
||||
|
||||
42
sysinfos.c
42
sysinfos.c
@@ -474,6 +474,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
|
||||
}
|
||||
|
||||
/*
|
||||
// ARM feature compiler flags
|
||||
#ifdef __aarch64__
|
||||
#warning "__aarch64__"
|
||||
#endif
|
||||
@@ -509,16 +510,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
|
||||
#endif
|
||||
*/
|
||||
|
||||
// Typical display format: AVX10.[version]_[vectorlength], if vector length is
|
||||
// omitted 256 is the default.
|
||||
// Ex: AVX10.1_512
|
||||
// Flags:
|
||||
// AVX10 128 256 512
|
||||
// 0 0 0 0 = AVX10 not supported
|
||||
// 1 1 1 0 = AVX10 256 bit max (version 2)
|
||||
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
|
||||
// Other combinations are not defined.
|
||||
|
||||
static inline bool cpu_arch_x86_64()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
@@ -766,6 +757,17 @@ static inline bool has_vbmi2()
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_amx()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, 0, cpu_info );
|
||||
return cpu_info[ EDX_Reg ] & AMX_TILE_Flag;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool has_aes()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
@@ -815,12 +817,9 @@ static inline bool has_sveaes()
|
||||
static inline bool has_sha256()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
if ( has_avx() )
|
||||
{
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, 0, cpu_info );
|
||||
return cpu_info[ EBX_Reg ] & SHA_Flag;
|
||||
}
|
||||
return false;
|
||||
#elif defined(__aarch64__) && defined(HWCAP_SHA2)
|
||||
// NEON SHA256
|
||||
@@ -835,7 +834,7 @@ static inline bool has_sha256()
|
||||
static inline bool has_sha512()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
if ( has_avx2() )
|
||||
if ( has_avx() )
|
||||
{
|
||||
unsigned int cpu_info[4] = { 0 };
|
||||
cpuid( EXTENDED_FEATURES, 1, cpu_info );
|
||||
@@ -852,7 +851,6 @@ static inline bool has_sha512()
|
||||
#endif
|
||||
}
|
||||
|
||||
// Arm only
|
||||
static inline bool has_sha3()
|
||||
{
|
||||
#if defined(__aarch64__) && defined(HWCAP_SHA3)
|
||||
@@ -944,16 +942,6 @@ static inline int sve_vector_length()
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Assume min_vlen refers to the register size
|
||||
static inline int rvv_vector_length()
|
||||
{
|
||||
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
|
||||
return __riscv_v_min_vlen;
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
// generic
|
||||
static inline int vector_length()
|
||||
{
|
||||
#if defined(__x86_64__)
|
||||
@@ -965,8 +953,8 @@ static inline int vector_length()
|
||||
return has_sve() ? sve_vector_length()
|
||||
: has_neon() ? 128
|
||||
: 0;
|
||||
#elif defined(__riscv) && defined(__riscv_vector)
|
||||
return rvv_vector_length();
|
||||
#elif defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
|
||||
return __riscv_v_min_vlen;
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
49
util.c
49
util.c
@@ -304,39 +304,28 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0)
|
||||
free(cmd);
|
||||
}
|
||||
|
||||
|
||||
void format_hashrate(double hashrate, char *output)
|
||||
// Decimal SI, factors of 1000
|
||||
void scale_hash_for_display ( double* hashrate, char* prefix )
|
||||
{
|
||||
char prefix = '\0';
|
||||
|
||||
if (hashrate < 10000) {
|
||||
// nop
|
||||
}
|
||||
else if (hashrate < 1e7) {
|
||||
prefix = 'k';
|
||||
hashrate *= 1e-3;
|
||||
}
|
||||
else if (hashrate < 1e10) {
|
||||
prefix = 'M';
|
||||
hashrate *= 1e-6;
|
||||
}
|
||||
else if (hashrate < 1e13) {
|
||||
prefix = 'G';
|
||||
hashrate *= 1e-9;
|
||||
}
|
||||
else {
|
||||
prefix = 'T';
|
||||
hashrate *= 1e-12;
|
||||
}
|
||||
|
||||
sprintf(
|
||||
output,
|
||||
prefix ? "%.2f %cH/s" : "%.2f H/s%c",
|
||||
hashrate, prefix
|
||||
);
|
||||
if ( *hashrate < 1e4 ) *prefix = 0;
|
||||
else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; }
|
||||
else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; }
|
||||
else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; }
|
||||
else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; }
|
||||
else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; }
|
||||
else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; }
|
||||
else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; }
|
||||
else { *prefix = 'Y'; *hashrate /= 1e24; }
|
||||
}
|
||||
|
||||
// For use with MiB etc
|
||||
// Render a hashrate as text, e.g. "12.34 MH/s", using decimal SI scaling
// supplied by scale_hash_for_display. Small rates get no unit prefix.
void format_hashrate( double hashrate, char *output )
{
   char prefix = '\0';

   scale_hash_for_display( &hashrate, &prefix );
   if ( prefix )
      sprintf( output, "%.2f %cH/s", hashrate, prefix );
   else
      sprintf( output, "%.2f H/s%c", hashrate, prefix );
}
|
||||
|
||||
// Binary SI, factors of 1024
|
||||
void format_number_si( double* n, char* si_units )
|
||||
{
|
||||
if ( *n < 1024*10 ) { *si_units = 0; return; }
|
||||
|
||||
@@ -55,9 +55,9 @@ the current directory it will be created.
|
||||
|
||||
Data file creation can take up to 30 minutes on a spinning hard drive.
|
||||
Once created the new data file will be verified and used immediately
|
||||
if a valid url and user were included on the command line.
|
||||
if a valid url and user were included on the command line.
|
||||
|
||||
A default data file can be created by omitting the url option. That will
|
||||
A default data file can also be created by omitting the url option. That will
|
||||
either verify an existing default data file or create one and verify it,
|
||||
then exit.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user