v23.15

2025-09-17 23:44:27 +00:00 · 2023-11-30 14:36:47 -05:00
parent 4e3f1b926f
commit 9d3a46c355
29 changed files with 3081 additions and 2234 deletions
--- a/6
+++ b/6
@@ -75,6 +75,12 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v23.15
+
+Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
+ARM: Fugue AES optimizations enabled.
+ARM: quark, qubit, x11gost algos optimized with NEON & AES.
+
 v23.14

 ARM: Groestl AES optimizations enabled.
--- a/algo/fugue/fugue-aesni.c
+++ b/algo/fugue/fugue-aesni.c
@@ -15,237 +15,176 @@
 *
 */

-#if defined(__AES__)
-
-#include <x86intrin.h>
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )

 #include <memory.h>
 #include "fugue-aesni.h"

+static const v128u64_t _supermix1a	__attribute__ ((aligned (16))) =
+   { 0x0202010807020100, 0x0a05000f06010c0b };

-MYALIGN const unsigned long long _supermix1a[]	= {0x0202010807020100, 0x0a05000f06010c0b};
-MYALIGN const unsigned long long _supermix1b[]	= {0x0b0d080703060504, 0x0e0a090c050e0f0a};
-MYALIGN const unsigned long long _supermix1c[]	= {0x0402060c070d0003, 0x090a060580808080};
-MYALIGN const unsigned long long _supermix1d[]	= {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
-MYALIGN const unsigned long long _supermix2a[]	= {0x07020d0880808080, 0x0b06010c050e0f0a};
-MYALIGN const unsigned long long _supermix4a[]	= {0x000f0a050c0b0601, 0x0302020404030e09};
-MYALIGN const unsigned long long _supermix4b[]	= {0x07020d08080e0d0d, 0x07070908050e0f0a};
-MYALIGN const unsigned long long _supermix4c[]	= {0x0706050403020000, 0x0302000007060504};
-MYALIGN const unsigned long long _supermix7a[]	= {0x010c0b060d080702, 0x0904030e03000104};
-MYALIGN const unsigned long long _supermix7b[]	= {0x8080808080808080, 0x0504070605040f06};
-//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
-//MYALIGN const unsigned char _shift_one_mask[]   = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
-//MYALIGN const unsigned char _shift_four_mask[]  = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
-//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
-//MYALIGN const unsigned char _aes_shift_rows[]   = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
-MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
-MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
+static const v128u64_t _supermix1b	__attribute__ ((aligned (16))) =
+   { 0x0b0d080703060504, 0x0e0a090c050e0f0a };

+static const v128u64_t _supermix1c	__attribute__ ((aligned (16))) =
+   { 0x0402060c070d0003, 0x090a060580808080 };

-MYALIGN const unsigned int _IV512[] = {		
-	0x00000000, 0x00000000,	0x7ea50788, 0x00000000,
+static const v128u64_t _supermix1d	__attribute__ ((aligned (16))) =
+   { 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
+
+static const v128u64_t _supermix2a	__attribute__ ((aligned (16))) =
+   { 0x07020d0880808080, 0x0b06010c050e0f0a };
+
+static const v128u64_t _supermix4a	__attribute__ ((aligned (16))) =
+   { 0x000f0a050c0b0601, 0x0302020404030e09 };
+
+static const v128u64_t _supermix4b	__attribute__ ((aligned (16))) =
+   { 0x07020d08080e0d0d, 0x07070908050e0f0a };
+
+static const v128u64_t _supermix4c	__attribute__ ((aligned (16))) =
+   { 0x0706050403020000, 0x0302000007060504 };
+
+static const v128u64_t _supermix7a	__attribute__ ((aligned (16))) =
+   { 0x010c0b060d080702, 0x0904030e03000104 };
+
+static const v128u64_t _supermix7b	__attribute__ ((aligned (16))) =
+   { 0x8080808080808080, 0x0504070605040f06 };
+
+static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
+   { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
+
+static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
+   { 0x000000001b1b0000, 0x0000000000000000 };
+
+static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
+   { 0x000000002d361b00, 0x0000000000000000 };
+
+static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
+   { 0x0303030303030303, 0x0303030303030303 };
+
+static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
+ {	0x00000000, 0x00000000,	0x7ea50788, 0x00000000,
 	0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
 	0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
 	0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
 	0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
-	0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
+	0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
+ };

-#if defined(__SSE4_1__)
+#if defined(__ARM_NEON)

-#define PACK_S0(s0, s1, t1)\
-   s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
+#define mask_1000(v)         v128_put32( v, 0, 3 )

-#define UNPACK_S0(s0, s1, t1)\
-   s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
-   s0 = mm128_mask_32( s0, 8 )
+static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };

-#define CMIX(s1, s2, r1, r2, t1, t2)\
-   t1 = s1;\
-   t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
-   r1 = _mm_xor_si128(r1, t1);\
-   r2 = _mm_xor_si128(r2, t1);
+static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
+   { 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };

-#else   // SSE2
+static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
+   { 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };

-#define PACK_S0(s0, s1, t1)\
-   t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
-   s0 = _mm_xor_si128(s0, t1);
+static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };

-#define UNPACK_S0(s0, s1, t1)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
-   s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
-   s0 = mm128_mask_32( s0, 8 )
+#define shuffle_3303(v)      vqtbl1q_u8( v, MASK_3303 )
+#define shuffle_0321(v)      vqtbl1q_u8( v, MASK_0321 )

-#define CMIX(s1, s2, r1, r2, t1, t2)\
-   t1 = _mm_shuffle_epi32(s1, 0xf9);\
-   t2 = _mm_shuffle_epi32(s2, 0xcf);\
-   t1 = _mm_xor_si128(t1, t2);\
-   r1 = _mm_xor_si128(r1, t1);\
-   r2 = _mm_xor_si128(r2, t1)
+#define CMIX( s1, s2, r1, r2, t1, t2 ) \
+   t1 = vqtbl1q_u8( s1, MASK_3321 ); \
+   t2 = vqtbl1q_u8( s2, MASK_3033 ); \
+   t1 = v128_xor( t1, t2 ); \
+   r1 = v128_xor( r1, t1 ); \
+   r2 = v128_xor( r2, t1 );
+
+#elif defined(__SSE4_1__)
+
+#define mask_1000(v)         v128_mask32( v, 8 )
+
+#define shuffle_3303(v)      _mm_shuffle_epi32( v, 0xf3 )
+#define shuffle_0321(v)      _mm_shuffle_epi32( v, 0x39 )
+
+#define CMIX( s1, s2, r1, r2, t1, t2 ) \
+   t1 = s1; \
+   t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
+   r1 = v128_xor( r1, t1 ); \
+   r2 = v128_xor( r2, t1 );

 #endif

-#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-	s10 = _mm_xor_si128(s10, t1);\
-	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-	t1 = _mm_slli_si128(t1, 8);\
-	s8 = _mm_xor_si128(s8, t1);\
-	t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
-	s0 = _mm_xor_si128(s0, t1)
+#define PACK_S0( s0, s1, t1 ) \
+ s0 = v128_movlane32( s0, 3, s1, 0 )

-
-#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-	s16 = _mm_xor_si128(s16, t1);\
-	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-	t1 = _mm_slli_si128(t1, 8);\
-	s8 = _mm_xor_si128(s8, t1);\
-	t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
-	s0 = _mm_xor_si128(s0, t1);\
-	t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
-	s4 = _mm_xor_si128(s4, t1)
+#define UNPACK_S0( s0, s1, t1 ) \
+   s1 = v128_movlane32( s1, 0, s0, 3 ); \
+   s0 = mask_1000( s0 )

 #define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-	s22 = _mm_xor_si128(s22, t1);\
-	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-	t1 = _mm_slli_si128(t1, 8);\
-	s8 = _mm_xor_si128(s8, t1);\
-	t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
-	s0 = _mm_xor_si128(s0, t1);\
-	t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
-	s4 = _mm_xor_si128(s4, t1);\
-	t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
-	s7 = _mm_xor_si128(s7, t1)
+	t1 = shuffle_3303( s0 ); \
+	s22 = v128_xor(s22, t1);\
+	t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
+	s0 = v128_movlane32( s0, 0, t1, 0 ); \
+	t1 = v128_alignr64( t1, v128_zero, 1 ); \
+	s8 = v128_xor(s8, t1);\
+	t1 = shuffle_3303( s24 ); \
+	s0 = v128_xor(s0, t1);\
+	t1 = shuffle_3303( s27 ); \
+	s4 = v128_xor(s4, t1);\
+	t1 = shuffle_3303( s30 ); \
+	s7 = v128_xor(s7, t1)

-#define PRESUPERMIX(t0, t1, t2, t3, t4)\
-   t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
-   t4 = _mm_add_epi8(t3, t3);\
-   t1 = _mm_srli_epi16(t0, 6);\
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
-
-/*
-#define PRESUPERMIX(x, t1, s1, s2, t2)\
-	s1 = x;\
-	s2 = _mm_add_epi8(x, x);\
-	t2 = _mm_add_epi8(s2, s2);\
-	t1 = _mm_srli_epi16(x, 6);\
-	t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-	s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-	x  = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
-*/
-
-#define SUBSTITUTE(r0, _t2 )\
-	_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
-	_t2 = _mm_aesenclast_si128( _t2, v128_zero )
+#define SUBSTITUTE( r0, _t2 ) \
+	_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
+	_t2 = v128_aesenclast_nokey( _t2 )

 #define SUPERMIX(t0, t1, t2, t3, t4)\
   t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
-   t4 = _mm_add_epi8(t3, t3);\
-   t1 = _mm_srli_epi16(t0, 6);\
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
-   t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
-   t2 = v128_xor3(t2, t3, t0 );\
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
+   t3 = v128_add8( t0, t0 ); \
+   t4 = v128_add8( t3, t3 ); \
+   t1 = v128_sr16( t0, 6 ); \
+   t1 = v128_and( t1, _lsbmask2 ); \
+   t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
+   t4 = v128_shuffle8( t2, _supermix1b ); \
+   t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
+   t1 = v128_shuffle8( t4, _supermix1c ); \
+   t4 = v128_xor( t4, t1 ); \
+   t1 = v128_shuffle8( t4, _supermix1d ); \
+   t4 = v128_xor( t4, t1 ); \
+   t1 = v128_shuffle8( t2, _supermix1a ); \
+   t2 = v128_xor3( t2, t3, t0 ); \
+   t2 = v128_shuffle8( t2, _supermix7a ); \
   t4 = v128_xor3( t4, t1, t2 ); \
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
-   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
-   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
-   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
+   t2 = v128_shuffle8( t2, _supermix7b ); \
+   t3 = v128_shuffle8( t3, _supermix2a ); \
+   t1 = v128_shuffle8( t0, _supermix4a ); \
+   t0 = v128_shuffle8( t0, _supermix4b ); \
   t4 = v128_xor3( t4, t2, t1 ); \
-   t0 = _mm_xor_si128(t0, t3);\
-   t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
-
-/*
-#define SUPERMIX(t0, t1, t2, t3, t4)\
-	PRESUPERMIX(t0, t1, t2, t3, t4);\
-	POSTSUPERMIX(t0, t1, t2, t3, t4)
-*/
-
-#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
-	t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
-	t4 = t1;\
-	t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t2 = v128_xor3(t2, t3, t0 );\
-	t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
-	t4 = _mm_xor_si128(t4, t2);\
-	t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
-	t4 = _mm_xor_si128(t4, t2);\
-	t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
-	t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
-	t0 = _mm_xor_si128(t0, t3);\
-	t4 = _mm_xor_si128(t4, t0);\
-	t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
-	t4 = _mm_xor_si128(t4, t0)
-
-#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
-	CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
-	PACK_S0(r1c, r1a, _t0);\
-	SUBSTITUTE(r1c, _t2 );\
-	SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-	_t0 = _mm_shuffle_epi32(r1c, 0x39);\
-	r2c = _mm_xor_si128(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r2d = _mm_xor_si128(r2d, _t0);\
-	UNPACK_S0(r1c, r1a, _t3);\
-	SUBSTITUTE(r2c, _t2 );\
-	SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-	_t0 = _mm_shuffle_epi32(r2c, 0x39);\
-	r3c = _mm_xor_si128(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r3d = _mm_xor_si128(r3d, _t0);\
-	UNPACK_S0(r2c, r2a, _t3);\
-	SUBSTITUTE(r3c, _t2 );\
-	SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-	UNPACK_S0(r3c, r3a, _t3)
+   t0 = v128_xor( t0, t3 ); \
+   t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );

 #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
 	CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
 	PACK_S0(r1c, r1a, _t0);\
 	SUBSTITUTE( r1c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-	_t0 = _mm_shuffle_epi32(r1c, 0x39);\
-	r2c = _mm_xor_si128(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r2d = _mm_xor_si128(r2d, _t0);\
+	_t0 = shuffle_0321( r1c ); \
+	r2c = v128_xor(r2c, _t0);\
+   _t0 = mask_1000( _t0 ); \
+	r2d = v128_xor(r2d, _t0);\
 	UNPACK_S0(r1c, r1a, _t3);\
 	SUBSTITUTE(r2c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-	_t0 = _mm_shuffle_epi32(r2c, 0x39);\
-	r3c = _mm_xor_si128(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r3d = _mm_xor_si128(r3d, _t0);\
+	_t0 = shuffle_0321( r2c ); \
+	r3c = v128_xor(r3c, _t0);\
+   _t0 = mask_1000( _t0 ); \
+	r3d = v128_xor(r3d, _t0);\
 	UNPACK_S0(r2c, r2a, _t3);\
 	SUBSTITUTE( r3c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-	_t0 = _mm_shuffle_epi32(r3c, 0x39);\
-	r4c = _mm_xor_si128(r4c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r4d = _mm_xor_si128(r4d, _t0);\
+	_t0 = shuffle_0321( r3c ); \
+	r4c = v128_xor(r4c, _t0);\
+   _t0 = mask_1000( _t0 ); \
+	r4d = v128_xor(r4d, _t0);\
 	UNPACK_S0(r3c, r3a, _t3);\
 	SUBSTITUTE( r4c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
 	block[1] = col[(base + a + 1) % s];\
 	block[2] = col[(base + a + 2) % s];\
 	block[3] = col[(base + a + 3) % s];\
-	x = _mm_load_si128((__m128i*)block)
+	x = v128_load( (v128_t*)block )

 #define STORECOLUMN(x, s)\
-	_mm_store_si128((__m128i*)block, x);\
+	v128_store((v128_t*)block, x );\
 	col[(base + 0) % s] = block[0];\
 	col[(base + 1) % s] = block[1];\
 	col[(base + 2) % s] = block[2];\
 	col[(base + 3) % s] = block[3]

-void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
+void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
+                  unsigned int uBlockCount )
 {
-   __m128i _t0, _t1, _t2, _t3;
+   v128_t _t0, _t1, _t2, _t3;

   switch(ctx->base)
   {
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
      pmsg += 4;
      uBlockCount--;
   }
-
 }

-void Final512(hashState_fugue *ctx, BitSequence *hashval)
+void Final512( hashState_fugue *ctx, uint8_t *hashval )   // runs the closing rounds on ctx->state and stores four 128-bit vectors at hashval
 {
   unsigned int block[4] __attribute__ ((aligned (32)));
   unsigned int col[36] __attribute__ ((aligned (16)));
 	unsigned int i, base;
-	__m128i r0, _t0, _t1, _t2, _t3;
+	v128_t r0, _t0, _t1, _t2, _t3;

-	for(i = 0; i < 12; i++)
+	for( i = 0; i < 12; i++ )   // flatten the state: lanes 0..2 of each of the 12 vectors go to col[3*i .. 3*i+2]
 	{
-		_mm_store_si128((__m128i*)block, ctx->state[i]);
+		v128_store( (v128_t*)block, ctx->state[i] );

 		col[3 * i + 0] = block[0];
 		col[3 * i + 1] = block[1];
 		col[3 * i + 2] = block[2];
 	}

-	base = (36 - (12 * ctx->base)) % 36;
+	base = ( 36 - (12 * ctx->base) ) % 36;   // starting rotation offset into col[], derived from ctx->base

-	for(i = 0; i < 32; i++)
+	for( i = 0; i < 32; i++ )   // 32 rounds, each: ROR3, CMIX, SMIX
 	{
 		// ROR3
 		base = (base + 33) % 36;

 		// CMIX
-		col[(base +  0) % 36] ^= col[(base + 4) % 36];
-		col[(base +  1) % 36] ^= col[(base + 5) % 36];
-		col[(base +  2) % 36] ^= col[(base + 6) % 36];
-		col[(base +  18) % 36] ^= col[(base + 4) % 36];
-		col[(base +  19) % 36] ^= col[(base + 5) % 36];
-		col[(base +  20) % 36] ^= col[(base + 6) % 36];
+		col[ (base +  0) % 36 ] ^= col[ (base + 4) % 36 ];
+		col[ (base +  1) % 36 ] ^= col[ (base + 5) % 36 ];
+		col[ (base +  2) % 36 ] ^= col[ (base + 6) % 36 ];
+		col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
+		col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
+		col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];

 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );
 	}

-	for(i = 0; i < 13; i++)
+	for( i = 0; i < 13; i++ )   // 13 iterations of four XOR / rotate / SMIX steps (ROR9, ROR9, ROR9, ROR8)
 	{
 		// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base +  9) % 36] ^= col[(base + 0) % 36];
-		col[(base + 18) % 36] ^= col[(base + 0) % 36];
-		col[(base + 27) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base +  9) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

 		// ROR9
 		base = (base + 27) % 36;

 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );

 		// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base + 10) % 36] ^= col[(base + 0) % 36];
-		col[(base + 18) % 36] ^= col[(base + 0) % 36];
-		col[(base + 27) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

 		// ROR9
 		base = (base + 27) % 36;

 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );

 		// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base + 10) % 36] ^= col[(base + 0) % 36];
-		col[(base + 19) % 36] ^= col[(base + 0) % 36];
-		col[(base + 27) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

 		// ROR9
 		base = (base + 27) % 36;

 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );

 		// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base + 10) % 36] ^= col[(base + 0) % 36];
-		col[(base + 19) % 36] ^= col[(base + 0) % 36];
-		col[(base + 28) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];

 		// ROR8
 		base = (base + 28) % 36;

 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );
 	}

 	// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
-	col[(base +  4) % 36] ^= col[(base + 0) % 36];
-	col[(base +  9) % 36] ^= col[(base + 0) % 36];
-	col[(base + 18) % 36] ^= col[(base + 0) % 36];
-	col[(base + 27) % 36] ^= col[(base + 0) % 36];
+	col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+	col[ (base +  9) % 36 ] ^= col[ (base + 0) % 36 ];
+	col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
+	col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

 	// Transform to the standard basis and store output; S1 || S2 || S3 || S4
-	LOADCOLUMN(r0, 36, 1);
-	_mm_store_si128((__m128i*)hashval, r0);
+	LOADCOLUMN( r0, 36, 1 );
+	v128_store( (v128_t*)hashval, r0 );   // NOTE(review): presumably requires hashval to be 16-byte aligned, like the _mm_store_si128 it replaces -- confirm

 	// Transform to the standard basis and store output; S9 || S10 || S11 || S12
-	LOADCOLUMN(r0, 36, 9);
-	_mm_store_si128((__m128i*)hashval + 1, r0);
+	LOADCOLUMN( r0, 36, 9 );
+	v128_store( (v128_t*)hashval + 1, r0 );

 	// Transform to the standard basis and store output; S18 || S19 || S20 || S21
-	LOADCOLUMN(r0, 36, 18);
-	_mm_store_si128((__m128i*)hashval + 2, r0);
+	LOADCOLUMN( r0, 36, 18 );
+	v128_store( (v128_t*)hashval + 2, r0 );

 	// Transform to the standard basis and store output; S27 || S28 || S29 || S30
-	LOADCOLUMN(r0, 36, 27);
-	_mm_store_si128((__m128i*)hashval + 3, r0);
+	LOADCOLUMN( r0, 36, 27 );
+	v128_store( (v128_t*)hashval + 3, r0 );   // fourth and last vector: 64 bytes written in total
 }

-HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
+int fugue512_Init( hashState_fugue *ctx, int nHashSize )
 {
 	int i;
 	ctx->processed_bits = 0;
@@ -487,18 +426,18 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
 	for(i = 0; i < 6; i++)
 		ctx->state[i] = v128_zero;

-	ctx->state[6]  = _mm_load_si128((__m128i*)_IV512 + 0);
-	ctx->state[7]  = _mm_load_si128((__m128i*)_IV512 + 1);
-	ctx->state[8]  = _mm_load_si128((__m128i*)_IV512 + 2);
-	ctx->state[9]  = _mm_load_si128((__m128i*)_IV512 + 3);
-	ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
-	ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
+	ctx->state[6]  = casti_v128( _IV512, 0 );
+	ctx->state[7]  = casti_v128( _IV512, 1 );
+	ctx->state[8]  = casti_v128( _IV512, 2 );
+	ctx->state[9]  = casti_v128( _IV512, 3 );
+	ctx->state[10] = casti_v128( _IV512, 4 );
+	ctx->state[11] = casti_v128( _IV512, 5 );

-	return SUCCESS;
+	return 0;
 }

-
-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
+int fugue512_Update( hashState_fugue *state, const void *data,
+                            uint64_t databitlen )
 {
 	unsigned int uByteLength, uBlockCount, uRemainingBytes;

@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
 		if(state->uBufferBytes != 0)
 		{
 			// Fill the buffer
-			memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
+			memcpy( state->buffer + state->uBufferBytes, (void*)data,
+                 state->uBlockLength - state->uBufferBytes );

 			// Process the buffer
 			Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
 		state->uBufferBytes += uByteLength;
 	}

-	return SUCCESS;
+	return 0;
 }

-HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
+int fugue512_Final( hashState_fugue *state, void *hashval )
 {
 	unsigned int i;
-	BitSequence lengthbuf[8] __attribute__((aligned(64)));
+	uint8_t lengthbuf[8] __attribute__((aligned(64)));

 	// Update message bit count
 	state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
 	// Finalization
 	Final512(state, hashval);

-	return SUCCESS;
+	return 0;
 }


-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
+                   uint64_t databitlen )   // NOTE(review): multiplied by 8 below, so this looks like a byte count despite the name -- confirm against callers
 {
-	fugue512_Init(hs, 512);
-	fugue512_Update(hs, data, databitlen*8);
-	fugue512_Final(hs, hashval);
-	return SUCCESS;
+	fugue512_Init( hs, 512 );                    // one-shot helper: initialise hs with nHashSize = 512
+	fugue512_Update( hs, data, databitlen*8 );   // third argument of fugue512_Update is named databitlen, hence the *8
+	fugue512_Final( hs, hashval );               // finalises and stores the digest at hashval
+	return 0;                                    // always 0; the removed code returned SUCCESS here
 }

 #endif  // AES
--- a/algo/fugue/fugue-aesni.h
+++ b/algo/fugue/fugue-aesni.h
@@ -14,37 +14,31 @@
 #ifndef FUGUE_HASH_API_H
 #define FUGUE_HASH_API_H

-#if defined(__AES__) 
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )

-#if !defined(__SSE4_1__)
-#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
-#endif
-
-#include "compat/sha3_common.h"
 #include "simd-utils.h"

-
 typedef struct
 {
-	__m128i			state[12];
+	v128_t			state[12];   // 12 x 128-bit state vectors; Final512 unpacks lanes 0..2 of each into col[36]
 	unsigned int	base;
-
 	unsigned int	uHashSize;
 	unsigned int	uBlockLength;
 	unsigned int	uBufferBytes;
-	DataLength		processed_bits;
-	BitSequence		buffer[4];
+	uint64_t 		processed_bits;   // running message length in bits; fugue512_Final adds uBufferBytes * 8
+	uint8_t  		buffer[4];   // staging area for a partial block; handed to Compress512 once filled

 } hashState_fugue __attribute__ ((aligned (64)));


 // These functions are deprecated, use the lower case macro aliases that use
 // the standard interface. This will be cleaned up at a later date.
-HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
+int fugue512_Init( hashState_fugue *state, int hashbitlen );

-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
+int fugue512_Update( hashState_fugue *state, const void *data,
+                     uint64_t databitlen );

-HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
+int fugue512_Final( hashState_fugue *state, void *hashval );

 #define fugue512_init( state ) \
   fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
   fugue512_Final


-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
+                   uint64_t databitlen);

 #endif // AES
 #endif // HASH_API_H
--- a/algo/gost/sph_gost.c
+++ b/algo/gost/sph_gost.c
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
                                           casti_m256i( b, 0 ) );
   casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
                                           casti_m256i( b, 1 ) );
-#elif defined(__SSE2__)
-   casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
-                                        casti_m128i( b, 0 ) );
-   casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
-                                        casti_m128i( b, 1 ) );
-   casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
-                                        casti_m128i( b, 2 ) );
-   casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
-                                        casti_m128i( b, 3 ) );
+#elif defined(__SSE2__) || defined(__ARM_NEON)
+   casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
+                                  casti_v128( b, 0 ) );
+   casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
+                                  casti_v128( b, 1 ) );
+   casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
+                                  casti_v128( b, 2 ) );
+   casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
+                                  casti_v128( b, 3 ) );
 #else
   const unsigned long long *A=a, *B=b;
 	unsigned long long *C=c;
--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -6,7 +6,7 @@
 #include <stdint.h>
 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/fugue/fugue-aesni.h"
 #else
  #include "algo/fugue/sph_fugue.h"
@@ -35,7 +35,7 @@ union _hmq1725_ctx_holder
 {
   blake512_context        blake;
   sph_bmw512_context      bmw;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_fugue         fugue;
 #else
   sph_fugue512_context    fugue;
@@ -177,7 +177,7 @@ extern void hmq1725hash(void *state, const void *input)
    sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
    sph_hamsi512_close( &ctx.hamsi, hashB ); //4

-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    fugue512_full( &ctx.fugue, hashA, hashB, 64 );
 #else
    sph_fugue512_init( &ctx.fugue );
@@ -208,7 +208,7 @@ extern void hmq1725hash(void *state, const void *input)

    if ( hashB[0] & mask ) //7
    {
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
       fugue512_full( &ctx.fugue, hashA, hashB, 64 );
 #else
       sph_fugue512_init( &ctx.fugue );
@@ -259,30 +259,18 @@ extern void hmq1725hash(void *state, const void *input)
 int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-//        uint32_t endiandata[32] __attribute__((aligned(64)));
-        uint32_t endiandata[20] __attribute__((aligned(32)));
-        uint32_t hash64[8] __attribute__((aligned(32)));
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t endiandata[20] __attribute__((aligned(32)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	uint32_t n = pdata[19] - 1;
 	const uint32_t first_nonce = pdata[19];
   int thr_id = mythr->id;  // thr_id arg is deprecated
-	//const uint32_t Htarg = ptarget[7];

 	//we need bigendian data...
-//        for (int k = 0; k < 32; k++)
-        for (int k = 0; k < 20; k++)
-                be32enc(&endiandata[k], pdata[k]);
+   for (int k = 0; k < 20; k++)
+         be32enc(&endiandata[k], pdata[k]);

-//        hmq_bmw512_midstate( endiandata );
-
-//	if (opt_debug) 
-//	{
-//		applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
-//	}
-	
-	/* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */
-	/* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */
 	if (ptarget[7]==0) {
 		do {
 			pdata[19] = ++n;
--- a/algo/quark/quark-gate.c
+++ b/algo/quark/quark-gate.c
@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_quark;
  gate->hash      = (void*)&quark_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
+                      | NEON_OPT;
  return true;
 };

--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -7,12 +7,12 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/groestl/aes_ni/hash-groestl.h"
 #else
  #include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
 void quark_hash(void *state, const void *input)
 {
   uint32_t hash[16] __attribute__((aligned(64)));
-   sph_blake512_context    ctx_blake;
+   blake512_context        ctx_blake;
   sph_bmw512_context      ctx_bmw;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_groestl       ctx_groestl;
 #else
   sph_groestl512_context  ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
   sph_keccak512_context   ctx_keccak;
   uint32_t mask = 8;

-   sph_blake512_init( &ctx_blake );
-   sph_blake512( &ctx_blake, input, 80 );
-   sph_blake512_close( &ctx_blake, hash );
-
+   blake512_full( &ctx_blake, hash, input, 80 );
+   
   sph_bmw512_init( &ctx_bmw );
   sph_bmw512( &ctx_bmw, hash, 64 );
   sph_bmw512_close( &ctx_bmw, hash ); 

   if ( hash[0] & mask )
   {
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
      init_groestl( &ctx_groestl, 64 );
      update_and_final_groestl( &ctx_groestl, (char*)hash,
                                        (const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
      sph_skein512_close( &ctx_skein, hash );
   }

-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   init_groestl( &ctx_groestl, 64 );
   update_and_final_groestl( &ctx_groestl, (char*)hash,
                                     (const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)

   if ( hash[0] & mask )
   {
-      sph_blake512_init( &ctx_blake );
-      sph_blake512( &ctx_blake, hash, 64 );
-      sph_blake512_close( &ctx_blake, hash );
+      blake512_full( &ctx_blake, hash, hash, 64 );
   }
   else
   {
--- a/algo/qubit/deep-2way.c
+++ b/algo/qubit/deep-2way.c
@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,

     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+     casti_v128(  endiandata, 4 ) = v128_bswap32(   casti_v128(  pdata, 4 ) );

     uint64_t *edata = (uint64_t*)endiandata;
     intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,

     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
-     casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+     casti_v128(  endiandata, 4 ) = v128_bswap32(   casti_v128(  pdata, 4 ) );

     uint64_t *edata = (uint64_t*)endiandata;
     intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
--- a/algo/qubit/qubit-gate.c
+++ b/algo/qubit/qubit-gate.c
@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_qubit;
  gate->hash      = (void*)&qubit_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
+                      | NEON_OPT;
  return true;
 };

--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -8,13 +8,9 @@
 #include <stdio.h>
 #include "algo/luffa/luffa_for_sse2.h" 
 #include "algo/cubehash/cubehash_sse2.h" 
-#if defined(__aarch64__)
-  #include "algo/simd/sph_simd.h"
-#else
-  #include "algo/simd/nist.h"
-#endif
+#include "algo/simd/simd-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
-#ifdef __AES__
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
 #include "algo/echo/aes_ni/hash_api.h"
 #else
 #include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
        hashState_luffa         luffa;
        cubehashParam           cubehash;
        sph_shavite512_context  shavite;
-#if defined(__aarch64__)
-   sph_simd512_context     simd;
-#else
-   hashState_sd            simd;
-#endif
-#ifdef __AES__
+        simd512_context         simd;
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_echo          echo;
 #else
        sph_echo512_context echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
        init_luffa(&qubit_ctx.luffa,512);
        cubehashInit(&qubit_ctx.cubehash,512,16,32);
        sph_shavite512_init(&qubit_ctx.shavite);
-#if defined(__aarch64__)
-   sph_simd512_init( &qubit_ctx.simd );
-#else
-   init_sd( &qubit_ctx.simd, 512 );
-#endif
-#ifdef __AES__
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        init_echo(&qubit_ctx.echo, 512);
 #else
        sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
        sph_shavite512( &ctx.shavite, hash, 64);
        sph_shavite512_close( &ctx.shavite, hash);

-#if defined(__aarch64__)
-    sph_simd512(&ctx.simd, (const void*) hash, 64);
-    sph_simd512_close(&ctx.simd, hash);
-#else
-    update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
-    final_sd( &ctx.simd, (BitSequence *)hash );
-#endif
-
-#ifdef __AES__
+        simd512_ctx( &ctx.simd, hash, hash, 64 );
+        
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        update_final_echo( &ctx.echo, (BitSequence *) hash,
                     (const BitSequence *) hash, 512 );
 #else
--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -45,10 +45,10 @@ static const uint32_t IV[5] =

 #define RR(a, b, c, d, e, f, s, r, k) \
 do{ \
-   a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
+   a = _mm_add_epi32( v128_rol32( _mm_add_epi32( _mm_add_epi32( \
                _mm_add_epi32( a, f( b ,c, d ) ), r ), \
                                 _mm_set1_epi64x( k ) ), s ), e ); \
-   c = mm128_rol_32( c, 10 );\
+   c = v128_rol32( c, 10 );\
 } while (0)

 #define ROUND1(a, b, c, d, e, f, s, r, k)  \
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -506,4 +506,156 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
     return 0;
 }

+#elif defined (X11GOST_2WAY)
+
+#include "algo/luffa/luffa_for_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
+  #include "algo/groestl/sph_groestl.h"
+  #include "algo/echo/sph_echo.h"
+#endif
+
+union _x11gost_context_overlay   // one context per hash stage; as a union, all members share storage
+{
+        blake512_2x64_context   blake;     // applied to the interleaved buffer in x11gost_2x64_hash
+        bmw512_2x64_context     bmw;       // applied to the interleaved buffer
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+        hashState_groestl       groestl;   // AES build: applied to each lane separately
+#else
+        sph_groestl512_context  groestl;   // non-AES build: sph implementation, each lane separately
+#endif
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+        hashState_echo          echo;      // AES build: applied to each lane separately
+#else
+        sph_echo512_context     echo;      // non-AES build: sph implementation, each lane separately
+#endif
+        jh512_2x64_context      jh;        // applied to the interleaved buffer
+        keccak512_2x64_context  keccak;    // applied to the interleaved buffer
+        skein512_2x64_context   skein;     // applied to the interleaved buffer
+        hashState_luffa         luffa;     // applied to each lane separately
+        cubehashParam           cube;      // applied to each lane separately
+        sph_shavite512_context  shavite;   // applied to each lane separately
+        simd512_context         simd;      // applied to each lane separately
+        sph_gost512_context     gost;      // applied to each lane separately
+};
+typedef union _x11gost_context_overlay x11gost_context_overlay;   // shorthand used by x11gost_2x64_hash
+
+int x11gost_2x64_hash( void *state, const void *input, int thr_id )
+{   // Hashes two inputs through the x11gost chain; thr_id is accepted but not referenced in the body.
+    uint8_t vhash[80*2] __attribute__((aligned(64)));   // interleaved working buffer for both lanes
+    uint8_t hash0[64]   __attribute__((aligned(64)));   // lane 0, deinterleaved
+    uint8_t hash1[64]   __attribute__((aligned(64)));   // lane 1, deinterleaved
+    x11gost_context_overlay ctx;
+    // The second input is read 80 bytes after the first; 640 is presumably the length in bits - confirm.
+    intrlv_2x64( vhash, input, input+80, 640 );
+    // Blake and BMW operate in place on the interleaved buffer.
+    blake512_2x64_full( &ctx.blake, vhash, vhash, 80 );
+    bmw512_2x64_init( &ctx.bmw );
+    bmw512_2x64_update( &ctx.bmw, vhash, 64 );
+    bmw512_2x64_close( &ctx.bmw, vhash );
+
+    dintrlv_2x64( hash0, hash1, vhash, 512 );
+    // Groestl is run on each lane separately, reusing the same context member.
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+    groestl512_full( &ctx.groestl, hash0, hash0, 512 );
+    groestl512_full( &ctx.groestl, hash1, hash1, 512 );
+#else
+    sph_groestl512_init( &ctx.groestl );
+    sph_groestl512( &ctx.groestl, hash0, 64 );
+    sph_groestl512_close( &ctx.groestl, hash0 );
+    sph_groestl512_init( &ctx.groestl );
+    sph_groestl512( &ctx.groestl, hash1, 64 );
+    sph_groestl512_close( &ctx.groestl, hash1 );
+#endif
+
+    intrlv_2x64( vhash, hash0, hash1, 512 );
+    // Skein, JH and Keccak operate in place on the re-interleaved buffer.
+    skein512_2x64_full( &ctx.skein, vhash, vhash, 64 );
+    jh512_2x64_ctx( &ctx.jh, vhash, vhash, 64 );
+    keccak512_2x64_ctx( &ctx.keccak, vhash, vhash, 64 );
+
+    dintrlv_2x64( hash0, hash1, vhash, 512 );
+    // GOST and every remaining stage are run on each lane separately.
+    sph_gost512_init( &ctx.gost );
+    sph_gost512( &ctx.gost, hash0, 64 );
+    sph_gost512_close( &ctx.gost, hash0 );
+    sph_gost512_init( &ctx.gost );
+    sph_gost512( &ctx.gost, hash1, 64 );
+    sph_gost512_close( &ctx.gost, hash1 );
+
+    luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
+    luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
+
+    cubehash_full( &ctx.cube, hash0, 512, hash0, 64 );
+    cubehash_full( &ctx.cube, hash1, 512, hash1, 64 );
+
+    sph_shavite512_init( &ctx.shavite );
+    sph_shavite512( &ctx.shavite, hash0, 64 );
+    sph_shavite512_close( &ctx.shavite, hash0 );
+    sph_shavite512_init( &ctx.shavite );
+    sph_shavite512( &ctx.shavite, hash1, 64 );
+    sph_shavite512_close( &ctx.shavite, hash1 );
+
+    simd512_ctx( &ctx.simd, hash0, hash0, 64 );
+    simd512_ctx( &ctx.simd, hash1, hash1, 64 );
+
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
+    echo_full( &ctx.echo, hash0, 512, hash0, 64 );
+    echo_full( &ctx.echo, hash1, 512, hash1, 64 );
+#else
+    sph_echo512_init( &ctx.echo );
+    sph_echo512( &ctx.echo, hash0, 64 );
+    sph_echo512_close( &ctx.echo, hash0 );
+    sph_echo512_init( &ctx.echo );
+    sph_echo512( &ctx.echo, hash1, 64 );
+    sph_echo512_close( &ctx.echo, hash1 );
+#endif
+    // Only the first 32 bytes of each lane's result are returned, lane 0 first.
+    memcpy( state,    hash0, 32 );
+    memcpy( state+32, hash1, 32 );
+
+    return 1;   // this function has no other return path
+}
+
+int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr )
+{   // Scans nonces two at a time starting from work->data[19]; always returns 0.
+   uint32_t hash[8*2]   __attribute__((aligned(64)));     // two 8-word results, lane 0 then lane 1
+   uint32_t edata[20*2]   __attribute__((aligned(64)));   // two 20-word copies of the header
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 2;   // loop bound; two nonces are consumed per pass
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;   // when set, valid hashes are not submitted
+   // Presumably byte-swaps the 80-byte header into edata (per the helper's name - confirm).
+   v128_bswap32_80( edata, pdata );
+   memcpy( edata+20, edata, 80 );   // duplicate the header for the second lane
+
+   do
+   {
+      edata[19] = n;     // nonce for lane 0
+      edata[39] = n+1;   // nonce for lane 1 (word 19 of the second copy)
+      if ( likely( x11gost_2x64_hash( hash, edata, thr_id ) ) )
+      {
+         if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
+         {
+            pdata[19] = bswap_32( n );   // nonce is written back byte-swapped before submitting
+            submit_solution( work, hash, mythr );
+         }
+         if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )   // lane 1 result starts at word 8
+         {
+            pdata[19] = bswap_32( n+1 );
+            submit_solution( work, hash+8, mythr );
+         }
+      }
+      n += 2;
+   } while ( n < last_nonce && !work_restart[thr_id].restart );   // stop at the bound or on a restart request
+   *hashes_done = n - first_nonce;   // count of nonces covered by this call
+   pdata[19] = n;                    // leave the next unscanned nonce in the work data
+   return 0;
+}
+
+
 #endif
--- a/algo/x11/x11gost-gate.c
+++ b/algo/x11/x11gost-gate.c
@@ -2,20 +2,24 @@

 bool register_x11gost_algo( algo_gate_t* gate )
 {
-#if defined (X11GOST_8WAY)
+#if defined(X11GOST_8WAY)
  init_x11gost_8way_ctx();
  gate->scanhash  = (void*)&scanhash_x11gost_8way;
  gate->hash      = (void*)&x11gost_8way_hash;
-#elif defined (X11GOST_4WAY)
+#elif defined(X11GOST_4WAY)
  init_x11gost_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x11gost_4way;
  gate->hash      = (void*)&x11gost_4way_hash;
+#elif defined(X11GOST_2WAY)
+  gate->scanhash  = (void*)&scanhash_x11gost_2x64;
+  gate->hash      = (void*)&x11gost_2x64_hash;
 #else
  init_x11gost_ctx();
  gate->scanhash  = (void*)&scanhash_x11gost;
  gate->hash      = (void*)&x11gost_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT 
+                      | NEON_OPT;
  return true;
 };

--- a/algo/x11/x11gost-gate.h
+++ b/algo/x11/x11gost-gate.h
@@ -8,6 +8,8 @@
  #define X11GOST_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X11GOST_4WAY 1
+#elif defined(__SSE2__) || defined(__ARM_NEON)
+  #define X11GOST_2WAY 1
 #endif

 bool register_x11gost_algo( algo_gate_t* gate );
@@ -26,6 +28,12 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
 void init_x11gost_4way_ctx();

+#elif defined(X11GOST_2WAY)
+
+int x11gost_2x64_hash( void *state, const void *input, int thr_id );
+int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr );
+
 #else

 void x11gost_hash( void *state, const void *input );
--- a/algo/x11/x11gost.c
+++ b/algo/x11/x11gost.c
@@ -1,6 +1,8 @@
 #include "x11gost-gate.h"

-#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY)
+// no longer used, not working when last used.
+
+#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY) && !defined(X11GOST_2WAY)

 #include <stdlib.h>
 #include <stdint.h>
--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -155,13 +155,13 @@ void skunk_4way_hash( void *output, const void *input )
     skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

-     cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 );
+     cubehashUpdateDigest( &ctx.cube, hash0, hash0, 64 );
     memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+     cubehashUpdateDigest( &ctx.cube, hash1, hash1, 64 );
     memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
+     cubehashUpdateDigest( &ctx.cube, hash2, hash2, 64 );
     memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
-     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
+     cubehashUpdateDigest( &ctx.cube, hash3, hash3, 64 );

     fugue512_full( &ctx.fugue, hash0, hash0, 64 );
     fugue512_full( &ctx.fugue, hash1, hash1, 64 );
--- a/algo/x16/minotaur.c
+++ b/algo/x16/minotaur.c
@@ -14,9 +14,6 @@
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
-//#if defined(__aarch64__)
-//  #include "algo/simd/sph_simd.h"
-//#endif
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
@@ -32,7 +29,7 @@
 #else
  #include "algo/groestl/sph_groestl.h"
 #endif
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/fugue/fugue-aesni.h"
 #else
  #include "algo/fugue/sph_fugue.h"
@@ -60,7 +57,7 @@ struct TortureGarden
 #else
   sph_echo512_context     echo;
 #endif
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   hashState_fugue         fugue;
 #else
   sph_fugue512_context    fugue;
@@ -116,7 +113,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
 #endif
 	         break;
        case 4:
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
            fugue512_full( &garden->fugue, hash, input, 64 );
 #else
            sph_fugue512_full( &garden->fugue, hash, input, 64 );
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -1022,7 +1022,7 @@ void x16r_2x64_prehash( void *vdata, void *pdata, const char *hash_order )
      break;
      case FUGUE:
         v128_bswap32_80( edata, pdata );
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
         fugue512_init( &x16r_ctx.fugue );
         fugue512_update( &x16r_ctx.fugue, edata, 76 );
 #else         
@@ -1218,7 +1218,7 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
 #endif
            break;
         case FUGUE:
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
            if ( i == 0 )
            {
               fugue512_update( &ctx.fugue, in0 + 76, 4 );
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -240,7 +240,7 @@ union _x16r_2x64_context_overlay
 #else
    sph_hamsi512_context    hamsi;
 #endif
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_fugue         fugue;
 #else
    sph_fugue512_context    fugue;
@@ -267,7 +267,7 @@ union _x16r_context_overlay
 {
        blake512_context        blake;
        sph_bmw512_context      bmw;
-#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_groestl       groestl;
 #else
        sph_groestl512_context  groestl;
@@ -285,7 +285,7 @@ union _x16r_context_overlay
        sph_echo512_context     echo;
 #endif
        sph_hamsi512_context    hamsi;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_fugue         fugue;
 #else
        sph_fugue512_context    fugue;
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -1230,7 +1230,7 @@ union _x16rv2_2x64_context_overlay
 #else
    sph_hamsi512_context    hamsi;
 #endif
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    hashState_fugue         fugue;
 #else
    sph_fugue512_context    fugue;
@@ -1445,7 +1445,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
 #endif
         break;
         case FUGUE:
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
            if ( i == 0 )
            {
               fugue512_update( &ctx.fugue, in0 + 76, 4 );
@@ -1607,7 +1607,7 @@ int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
      break;
      case FUGUE:
         v128_bswap32_80( edata, pdata );
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
         fugue512_init( &x16rv2_ctx.fugue );
         fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
 #else
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -928,11 +928,8 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,

 #elif defined(X17_2X64)

-// Need sph in some cases
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-//#include "algo/simd/sph_simd.h"
-//#include "algo/simd/nist.h"
 #if !( defined(__SSE4_2__) || defined(__ARM_NEON) )
  #include "algo/hamsi/sph_hamsi.h"
 #endif
@@ -940,11 +937,9 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
 #include "algo/haval/sph-haval.h"
 #if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
  #include "algo/groestl/sph_groestl.h"
-#endif
-#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
  #include "algo/echo/sph_echo.h"
+  #include "algo/fugue/sph_fugue.h"
 #endif
-#include "algo/fugue/sph_fugue.h"

 union _x17_context_overlay
 {
@@ -960,7 +955,7 @@ union _x17_context_overlay
 #else
        sph_echo512_context     echo;
 #endif
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_fugue         fugue;
 #else
        sph_fugue512_context    fugue;
@@ -1061,7 +1056,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
    sph_hamsi512_close( &ctx.hamsi, hash1 );
 #endif

-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
    fugue512_full( &ctx.fugue, hash0, hash0, 64 );
    fugue512_full( &ctx.fugue, hash1, hash1, 64 );
 #else
--- a/algo/x22/x22i.c
+++ b/algo/x22/x22i.c
@@ -4,7 +4,7 @@

 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/fugue/fugue-aesni.h"
 #else
  #include "algo/fugue/sph_fugue.h"
@@ -38,7 +38,7 @@ union _x22i_context_overlay
 {
        blake512_context       blake;
        sph_bmw512_context     bmw;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_fugue         fugue;
 #else
        sph_fugue512_context    fugue;
@@ -127,7 +127,7 @@ int x22i_hash( void *output, const void *input, int thrid )
   sph_hamsi512(&ctx.hamsi, (const void*) hash, 64);
   sph_hamsi512_close(&ctx.hamsi, hash);

-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   fugue512_full( &ctx.fugue, hash, hash, 64 );
 #else
   sph_fugue512_init(&ctx.fugue);
@@ -147,7 +147,7 @@ int x22i_hash( void *output, const void *input, int thrid )
   sph_sha512( &ctx.sha512, &hash[128], 64 );
   sph_sha512_close( &ctx.sha512, &hash[192] );
   
-   ComputeSingleSWIFFTX((unsigned char*)hash, (unsigned char*)hash2);
+   ComputeSingleSWIFFTX( (unsigned char*)hash, (unsigned char*)hash2 );

   if ( work_restart[thrid].restart ) return 0;
   
@@ -162,7 +162,7 @@ int x22i_hash( void *output, const void *input, int thrid )
   sph_tiger_close(&ctx.tiger, (void*) hash2);

   memset(hash, 0, 64);
-   LYRA2RE((void*) hash, 32, (const void*) hash2, 32, (const void*) hash2, 32, 1, 4, 4);
+   LYRA2RE( (void*)hash, 32, (const void*)hash2, 32, (const void*)hash2, 32, 1, 4, 4 );

   sph_gost512_init(&ctx.gost);
   sph_gost512 (&ctx.gost, (const void*) hash, 64);
--- a/algo/x22/x25x.c
+++ b/algo/x22/x25x.c
@@ -4,7 +4,7 @@

 #include "algo/blake/blake512-hash.h"
 #include "algo/bmw/sph_bmw.h"
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
  #include "algo/fugue/fugue-aesni.h"
 #else
  #include "algo/fugue/sph_fugue.h"
@@ -41,7 +41,7 @@ union _x25x_context_overlay
 {
        blake512_context        blake;
        sph_bmw512_context      bmw;
-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
        hashState_fugue         fugue;
 #else
        sph_fugue512_context    fugue;
@@ -132,7 +132,7 @@ int x25x_hash( void *output, const void *input, int thrid )
   sph_hamsi512(&ctx.hamsi, (const void*) &hash[10], 64);
   sph_hamsi512_close(&ctx.hamsi, &hash[11]);

-#if defined(__AES__)
+#if defined(__AES__) || defined(__ARM_FEATURE_AES)
   fugue512_full( &ctx.fugue, &hash[12], &hash[11], 64 );
 #else
   sph_fugue512_init(&ctx.fugue);
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.14.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.15.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='23.14'
-PACKAGE_STRING='cpuminer-opt 23.14'
+PACKAGE_VERSION='23.15'
+PACKAGE_STRING='cpuminer-opt 23.15'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 23.15 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1432,7 +1432,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 23.14:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 23.15:";;
   esac
  cat <<\_ACEOF

@@ -1538,7 +1538,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 23.14
+cpuminer-opt configure 23.15
 generated by GNU Autoconf 2.71

 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 23.14, which was
+It was created by cpuminer-opt $as_me 23.15, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='23.14'
+ VERSION='23.15'


 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 23.14, which was
+This file was extended by cpuminer-opt $as_me 23.15, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 23.14
+cpuminer-opt config.status 23.15
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [23.14])
+AC_INIT([cpuminer-opt], [23.15])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/4325
+++ b/4325
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -207,7 +207,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )

 #endif

-// broadcast lane l to all lanes
+// broadcast (replicate) lane l to all lanes
 #define v128_replane64( v, l ) \
   ( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
                : _mm_shuffle_epi32( v, 0xee )
@@ -319,29 +319,27 @@ static inline __m128i v128_neg1_fn()
 //    c[7:6] source element selector

 // Convert type and abbreviate name: eXtract Insert Mask = XIM
-#define mm128_xim_32( v1, v0, c ) \
+#define v128_xim32( v1, v0, c ) \
   _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
                                    _mm_castsi128_ps( v0 ), c ) )
-#define v128_xim32 mm128_xim_32

 // Examples of simple operations using xim:
 /*
 // Copy i32 to element c of dest and copy remaining elemnts from v.
 #define v128_put32( v, i32, c ) \
-      mm128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
+      v128_xim32( v, mm128_mov32_128( i32 ), (c)<<4 )
 */


-#define mm128_mask_32( v, m )    mm128_xim_32( v, v, m )
+#define v128_mask32( v, m )    v128_xim32( v, v, (m) & 0xf )   // (m) parenthesized: only the low 4 bits of the whole argument are passed on

 // Zero 32 bit elements when corresponding bit in 4 bit mask is set.
-//static inline __m128i mm128_mask_32( const __m128i v, const int m ) 
-//{   return mm128_xim_32( v, v, m ); }
-#define v128_mask32    mm128_mask_32
+//static inline __m128i v128_mask32( const __m128i v, const int m ) 
+//{   return v128_xim32( v, v, m ); }

-// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
+// Copy element l0 of v0 to element l1 of dest and copy remaining elements from v1.
 #define v128_movlane32( v1, l1, v0, l0 ) \
-  mm128_xim_32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
+  v128_xim32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )

 #endif  // SSE4_1

@@ -452,7 +450,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

 #define v128_orand( a, b, c )     _mm_or_si128( a, _mm_and_si128( b, c ) )

-#define v128_xnor( a, b )         mm128_not( _mm_xor_si128( a, b ) )
+#define v128_xnor( a, b )         v128_not( _mm_xor_si128( a, b ) )

 #endif

@@ -483,7 +481,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_qrev16(v)      v128_shuffle16( v, 0x1b )
 #define v128_lrev16(v)      v128_shuffle16( v, 0xb1 )

-// These should never be callled from application code, use rol/ror.
+// Internal use only, should never be called from application code.
 #define v128_ror64_sse2( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )

@@ -498,14 +496,14 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

 #if defined(__AVX512VL__)

-// AVX512 fastest all rotations.
+// AVX512 fastest for all rotations.
 #define v128_ror64                _mm_ror_epi64
 #define v128_rol64                _mm_rol_epi64
 #define v128_ror32                _mm_ror_epi32
 #define v128_rol32                _mm_rol_epi32

 // ror/rol will always find the fastest but these names may fit better with
-// application code performing shuffles rather than bit rotations.
+// application code performing byte operations rather than bit rotations.
 #define v128_shuflr64_8( v)         _mm_ror_epi64( v,  8 )
 #define v128_shufll64_8( v)         _mm_rol_epi64( v,  8 )
 #define v128_shuflr64_16(v)         _mm_ror_epi64( v, 16 )
@@ -577,7 +575,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 : v128_rol32_sse2( v, c )

 #elif defined(__SSE2__)
-// SSE2: fastest 32 bit, very fast 16
+// SSE2: fastest 32 bit, very fast 16, all else slow

 #define v128_ror64( v, c ) \
   ( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \
@@ -608,9 +606,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

 #endif

-//#define v128_ror64            mm128_ror_64
-//#define v128_rol64            mm128_rol_64
-//#define v128_ror32            mm128_ror_32
+// deprecated
 #define mm128_rol_32        v128_rol32

 /* not used
@@ -633,7 +629,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
   _mm_ror_epi32( v0, c ); \
   _mm_ror_epi32( v1, c )

-#define mm128_2rol32( v1, v0, c ) \
+#define v128_2rol32( v1, v0, c ) \
   _mm_rol_epi32( v0, c ); \
   _mm_rol_epi32( v1, c )

@@ -684,11 +680,13 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

 // Cross lane shuffles

+// No NEON version
 #define v128_shuffle32     _mm_shuffle_epi32

-// shuffle using vector mask, for compatibility with NEON
+/* Not used, exists only for compatibility with NEON if ever needed.
 #define v128_shufflev32( v, vmask ) \
  v128_shuffle32( v, mm128_movmask_32( vmask ) )
+*/

 #define v128_shuffle8     _mm_shuffle_epi8

@@ -697,12 +695,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_shuffle2_64( v1, v2, c ) \
   _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
                                     _mm_castsi128_pd( v2 ), c ) ); 
-#define mm128_shuffle2_64   v128_shuffle2_64

 #define v128_shuffle2_32( v1, v2, c ) \
   _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
                                     _mm_castsi128_ps( v2 ), c ) ); 
-#define mm128_shuffle2_32   v128_shuffle2_32

 // Rotate vector elements accross all lanes

@@ -734,6 +730,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
 #define v128_bswap32( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
                                        0x0405060700010203 ) )
+// deprecated
 #define mm128_bswap_32      v128_bswap32

 #define v128_bswap16( v ) \
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -68,7 +68,7 @@
 #define v128_mul32                    vmulq_u32
 #define v128_mul16                    vmulq_u16

-// Widening, shuffle high element to align with Intel
+// Widening multiply, align source elements with Intel
 static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 {
   return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
@@ -97,7 +97,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_cmplt16( v1, v0 )        vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
 #define v128_cmplt8( v1, v0 )         vcltq_s8( (int8x16_t)v1, (int8x16_t)v0 )

-// bit shift
+// Logical bit shift
 #define v128_sl64                     vshlq_n_u64
 #define v128_sl32                     vshlq_n_u32
 #define v128_sl16                     vshlq_n_u16
@@ -108,7 +108,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
 #define v128_sr16                     vshrq_n_u16
 #define v128_sr8                      vshrq_n_u8

-// Unit tested, working.
+// Arithmetic shift.
 #define v128_sra64( v, c )            vshrq_n_s64( (int64x2_t)v, c )
 #define v128_sra32( v, c )            vshrq_n_s32( (int32x4_t)v, c )
 #define v128_sra16( v, c )            vshrq_n_s16( (int16x8_t)v, c )
@@ -255,24 +255,24 @@ typedef union
 #define v128_8                         vmovq_n_u8

 #define v64_set32( u32_1, u32_0 ) \
-   vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
+  vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )

 #define v64_set16( u16_3, u16_2, u16_1, u16_0 ) \
-    vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16 ) \
-                               | (uint32_t)(u16_2)       ) << 32 ) \
-               | ( (uint64_t)( ( (uint32_t)(u16_1) << 16 ) \
-                               | (uint32_t)(u16_0)       )       ) )
+  vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16) \
+                             | (uint32_t)(u16_2)       ) << 32 ) \
+             | ( (uint64_t)( ( (uint32_t)(u16_1) << 16) \
+                             | (uint32_t)(u16_0)       )       ) )

 #define v64_set8( u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ) \
-    vcreate_u8( \
-     ( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_7) << 8 ) \
-                               | (uint16_t)(u8_6)      ) << 16 ) \
-                 | ( (uint32_t)(((uint16_t)(u8_5) << 8 ) \
-                               | (uint16_t)(u8_4)      )       )) << 32 )  \
-   | ( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_3) << 8 ) \
-                               | (uint16_t)(u8_2)      ) << 16 ) \
-                 | ( (uint32_t)(((uint16_t)(u8_1) << 8 ) \
-                               | (uint16_t)(u8_0)      )       ))       ))
+  vcreate_u8( \
+     ( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_7) << 8) \
+                                | (uint16_t)(u8_6)      ) << 16 ) \
+                 | ( (uint32_t)( ((uint16_t)(u8_5) << 8) \
+                                | (uint16_t)(u8_4)      )       ) ) << 32 )  \
+   | ( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_3) << 8) \
+                                | (uint16_t)(u8_2)      ) << 16 ) \
+                 | ( (uint32_t)( ((uint16_t)(u8_1) << 8) \
+                                | (uint16_t)(u8_0)      )       ) )       ) )

 #define v128_set64( u64_1, u64_0 ) \
   vcombine_u64( vcreate_u64( u64_0 ), vcreate_u64( u64_1 ) ) 
@@ -406,15 +406,17 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 v1 = vorrq_u32( v1, t1 ); \
 }

+/* Not used anywhere and hopefully never will be.
 // vector mask, use as last resort. prefer tbl, rev, alignr, etc
 #define v128_shufflev32( v, vmask ) \
  v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
              ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \
+*/

 #define v128_shuffle8( v, vmask ) \
-     vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask );
+     vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask )

 // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
 // Bit rotation already promotes faster widths. Usage is context sensitive.
@@ -532,20 +534,6 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
   casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
 }

-// Prograsmmable shuffles
-// no compatible shuffles with x86_64, will require targeted user code.
-              
-#define v128_extractmask8( df, de, dd, dc, db, da, d9, d8, \
-                           d7, d6, d5, d4, d3, d2, d1, d0, vmask )   \
-  d0 = ((uint8_t*)(&vmask))[0];   d1 = ((uint8_t*)(&vmask))[1]; \
-  d2 = ((uint8_t*)(&vmask))[2];   d3 = ((uint8_t*)(&vmask))[3]; \
-  d4 = ((uint8_t*)(&vmask))[0];   d5 = ((uint8_t*)(&vmask))[1]; \
-  d6 = ((uint8_t*)(&vmask))[2];   d7 = ((uint8_t*)(&vmask))[3]; \
-  d8 = ((uint8_t*)(&vmask))[0];   d9 = ((uint8_t*)(&vmask))[1]; \
-  da = ((uint8_t*)(&vmask))[2];   db = ((uint8_t*)(&vmask))[3]; \
-  dc = ((uint8_t*)(&vmask))[0];   dd = ((uint8_t*)(&vmask))[1]; \
-  de = ((uint8_t*)(&vmask))[2];   df = ((uint8_t*)(&vmask))[3]; 
-
 // Blendv: ORs v128_andnot( mask, v1 ) with v128_and( mask, v0 ).
 // NOTE(review): presumably a bitwise select (v0 bits where mask is set, v1
 // bits elsewhere), assuming v128_andnot complements its first argument -
 // confirm against its definition.
 // mask appears twice in the expansion, so pass an expression without side
 // effects.
 #define v128_blendv( v1, v0, mask ) \
   v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )