Jay D Dee
2023-10-06 22:18:09 -04:00
parent bc5a5c6df8
commit 31c4dedf59
144 changed files with 5931 additions and 3746 deletions

View File

@@ -17,6 +17,8 @@
#include <stdio.h>
#include <inttypes.h>
#if defined(__SSE2__)
#include <immintrin.h>
#include "argon2.h"
@@ -183,3 +185,5 @@ void ar2_fill_segment(const argon2_instance_t *instance,
free(pseudo_rands);
}
#endif
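A minimal sketch of the guard pattern these hunks add, assuming the goal is to keep the x86-only header out of non-x86 builds; the ARM branch below is illustrative only and not part of this file's diff:
#if defined(__SSE2__)
  #include <immintrin.h>     // x86 SIMD intrinsics
#elif defined(__ARM_NEON)
  #include <arm_neon.h>      // hypothetical ARM branch, shown for illustration
#endif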

View File

@@ -114,7 +114,7 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
uint32_t nonce = first_nonce;
const bool bench = opt_benchmark;
mm128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
do
{
edata[19] = nonce;
@@ -160,7 +160,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
uint32_t parallelism = 1; // 1 thread, 2 lanes
const bool bench = opt_benchmark;
mm128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
do {
edata[19] = n;
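Both scanhash hunks rename the header byte-swap helper from the mm128_ prefix to the portable v128_ prefix. As a reference, a scalar sketch of what a bswap32-80 helper amounts to, assuming it simply byte-swaps the 20 32-bit words of the 80-byte block header (the loop body is illustrative, not taken from the diff):
static void bswap32_80_scalar( uint32_t dst[20], const uint32_t src[20] )
{
   for ( int i = 0; i < 20; i++ )
      dst[i] = __builtin_bswap32( src[i] );   // 80 bytes = 20 big-endian words
}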

View File

@@ -131,22 +131,22 @@ static void fill_block(__m256i *state, const block *ref_block,
#else // SSE2
static void fill_block(__m128i *state, const block *ref_block,
static void fill_block( v128_t *state, const block *ref_block,
block *next_block, int with_xor) {
__m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
v128_t block_XY[ARGON2_OWORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
block_XY[i] = _mm_xor_si128(
state[i], _mm_load_si128((const __m128i *)next_block->v + i));
state[i] = v128_xor(
state[i], v128_load((const v128_t *)ref_block->v + i));
block_XY[i] = v128_xor(
state[i], v128_load((const v128_t *)next_block->v + i));
}
} else {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm_xor_si128(
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
block_XY[i] = state[i] = v128_xor(
state[i], v128_load((const v128_t *)ref_block->v + i));
}
}
@@ -185,8 +185,8 @@ static void fill_block(__m128i *state, const block *ref_block,
state[39], state[47], state[55], state[63] );
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(state[i], block_XY[i]);
_mm_store_si128((__m128i *)next_block->v + i, state[i]);
state[i] = v128_xor(state[i], block_XY[i]);
v128_store((v128_t *)next_block->v + i, state[i]);
}
}
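The data flow of this fill_block, restated as a scalar sketch (an Argon2 block is 1024 bytes = 128 quadwords; the P() comment stands in for the BlaMka column/row rounds and is a placeholder, not a function in this code):
static void fill_block_sketch( uint64_t state[128], const uint64_t *ref,
                               uint64_t *next, int with_xor )
{
   uint64_t XY[128];
   for ( int i = 0; i < 128; i++ )
   {
      state[i] ^= ref[i];                        // absorb the reference block
      XY[i] = with_xor ? state[i] ^ next[i]      // keep a copy for the feed-forward
                       : state[i];
   }
   // P( state );  BlaMka rounds over columns then rows, elided here
   for ( int i = 0; i < 128; i++ )
      next[i] = state[i] ^= XY[i];               // feed-forward, then store the next block
}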
@@ -202,8 +202,8 @@ static void next_addresses(block *address_block, block *input_block) {
__m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
__m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];
#else
__m128i zero_block[ARGON2_OWORDS_IN_BLOCK];
__m128i zero2_block[ARGON2_OWORDS_IN_BLOCK];
v128_t zero_block[ARGON2_OWORDS_IN_BLOCK];
v128_t zero2_block[ARGON2_OWORDS_IN_BLOCK];
#endif
memset(zero_block, 0, sizeof(zero_block));
@@ -232,7 +232,7 @@ void fill_segment(const argon2_instance_t *instance,
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];
#else
__m128i state[ARGON2_OWORDS_IN_BLOCK];
v128_t state[ARGON2_OWORDS_IN_BLOCK];
#endif
// int data_independent_addressing;

View File

@@ -19,16 +19,6 @@
#define BLAKE_ROUND_MKA_OPT_H
#include "blake2-impl.h"
#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
#endif
#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
#include <x86intrin.h>
#endif
#include "simd-utils.h"
#if !defined(__AVX512F__)
@@ -39,7 +29,7 @@
(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define _mm_roti_epi64(x, c) \
#define v128_ror64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
@@ -47,20 +37,20 @@
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
? v128_xor(v128_sr64((x), -(c)), \
v128_add64((x), (x))) \
: v128_xor(v128_sr64((x), -(c)), \
v128_sl64((x), 64 - (-(c))))
#else /* defined(__SSE2__) */
#define _mm_roti_epi64(r, c) \
_mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
#define v128_ror64(r, c) \
v128_xor(v128_sr64((r), -(c)), v128_sl64((r), 64 - (-(c))))
#endif
#else
#endif
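The renamed macro keeps the original negative-count convention, so v128_ror64(x, -32) rotates right by 32. The special cases follow from ordinary rotate identities; in scalar terms, for unsigned 64-bit values:
static inline uint64_t ror64( uint64_t x, unsigned c )
{
   return ( x >> c ) | ( x << ( 64 - c ) );
}
// ror64 by 32, 24 or 16 moves whole words/bytes, hence the shuffles above.
// ror64( x, 63 ) == ( x >> 63 ) | ( x << 1 ) == ( x >> 63 ) | ( x + x ),
// which is the shift + add form used for the 63 case.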
static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
const __m128i z = _mm_mul_epu32(x, y);
return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
const v128_t z = v128_mul32(x, y);
return v128_add64(v128_add64(x, y), v128_add64(z, z));
}
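fBlaMka is Argon2's multiply-hardened addition: v128_mul32, like _mm_mul_epu32 before it, multiplies the low 32 bits of each 64-bit lane into a full 64-bit product. A scalar reference of the per-lane operation:
static inline uint64_t fBlaMka_scalar( uint64_t x, uint64_t y )
{
   const uint64_t z = (uint64_t)(uint32_t)x * (uint32_t)y;   // lo32(x) * lo32(y)
   return x + y + 2 * z;
}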
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -68,20 +58,20 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = _mm_xor_si128(D0, A0); \
D1 = _mm_xor_si128(D1, A1); \
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = _mm_roti_epi64(D0, -32); \
D1 = _mm_roti_epi64(D1, -32); \
D0 = v128_ror64(D0, -32); \
D1 = v128_ror64(D1, -32); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = _mm_xor_si128(B0, C0); \
B1 = _mm_xor_si128(B1, C1); \
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = _mm_roti_epi64(B0, -24); \
B1 = _mm_roti_epi64(B1, -24); \
B0 = v128_ror64(B0, -24); \
B1 = v128_ror64(B1, -24); \
} while ((void)0, 0)
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -89,27 +79,27 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = _mm_xor_si128(D0, A0); \
D1 = _mm_xor_si128(D1, A1); \
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = _mm_roti_epi64(D0, -16); \
D1 = _mm_roti_epi64(D1, -16); \
D0 = v128_ror64(D0, -16); \
D1 = v128_ror64(D1, -16); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = _mm_xor_si128(B0, C0); \
B1 = _mm_xor_si128(B1, C1); \
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = _mm_roti_epi64(B0, -63); \
B1 = _mm_roti_epi64(B1, -63); \
B0 = v128_ror64(B0, -63); \
B1 = v128_ror64(B1, -63); \
} while ((void)0, 0)
#if defined(__SSSE3__)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
__m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
v128_t t0 = v128_alignr8(B1, B0, 8); \
v128_t t1 = v128_alignr8(B0, B1, 8); \
B0 = t0; \
B1 = t1; \
\
@@ -117,16 +107,16 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
C0 = C1; \
C1 = t0; \
\
t0 = _mm_alignr_epi8(D1, D0, 8); \
t1 = _mm_alignr_epi8(D0, D1, 8); \
t0 = v128_alignr8(D1, D0, 8); \
t1 = v128_alignr8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
__m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
v128_t t0 = v128_alignr8(B0, B1, 8); \
v128_t t1 = v128_alignr8(B1, B0, 8); \
B0 = t0; \
B1 = t1; \
\
@@ -134,37 +124,37 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
C0 = C1; \
C1 = t0; \
\
t0 = _mm_alignr_epi8(D0, D1, 8); \
t1 = _mm_alignr_epi8(D1, D0, 8); \
t0 = v128_alignr8(D0, D1, 8); \
t1 = v128_alignr8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = D0; \
__m128i t1 = B0; \
v128_t t0 = D0; \
v128_t t1 = B0; \
D0 = C0; \
C0 = C1; \
C1 = D0; \
D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \
D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \
B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \
B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0, t1; \
v128_t t0, t1; \
t0 = C0; \
C0 = C1; \
C1 = t0; \
t0 = B0; \
t1 = D0; \
B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \
B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \
D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \
D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#endif
@@ -462,4 +452,5 @@ static inline __m512i muladd(__m512i x, __m512i y)
} while ((void)0, 0)
#endif /* __AVX512F__ */
#endif /* BLAKE_ROUND_MKA_OPT_H */

View File

@@ -34,7 +34,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
if (opt_benchmark)
HTarget = 0x7f;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
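v128_bswap32_intrlv80_4x32 replaces the mm128_ spelling. As a sketch, assuming the helper byte-swaps the 20 header words and broadcasts each across the four interleaved 32-bit lanes (treating vdata as a uint32_t array; the per-lane nonces are written separately inside the scan loop):
for ( int i = 0; i < 20; i++ )
{
   const uint32_t w = __builtin_bswap32( pdata[i] );
   vdata[ 4*i ] = vdata[ 4*i + 1 ] = vdata[ 4*i + 2 ] = vdata[ 4*i + 3 ] = w;
}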

View File

@@ -277,56 +277,56 @@ static const unsigned sigma[16][16] = {
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, 7 ) ^ Mx( r, 6 ), \
CSx( r, 5 ) ^ Mx( r, 4 ), \
CSx( r, 3 ) ^ Mx( r, 2 ), \
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, 6 ) ^ Mx( r, 7 ), \
CSx( r, 4 ) ^ Mx( r, 5 ), \
CSx( r, 2 ) ^ Mx( r, 3 ), \
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V0 = mm128_shufll_32( V0 ); \
V3 = mm128_swap_64( V3 ); \
V2 = mm128_shuflr_32( V2 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, D ) ^ Mx( r, C ), \
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
V0 = v128_shufll32( V0 ); \
V3 = v128_swap64( V3 ); \
V2 = v128_shuflr32( V2 ); \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, D ) ^ Mx( r, C ), \
CSx( r, B ) ^ Mx( r, A ), \
CSx( r, 9 ) ^ Mx( r, 8 ), \
CSx( r, F ) ^ Mx( r, E ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, C ) ^ Mx( r, D ), \
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, C ) ^ Mx( r, D ), \
CSx( r, A ) ^ Mx( r, B ), \
CSx( r, 8 ) ^ Mx( r, 9 ), \
CSx( r, E ) ^ Mx( r, F ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V0 = mm128_shuflr_32( V0 ); \
V3 = mm128_swap_64( V3 ); \
V2 = mm128_shufll_32( V2 ); \
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
V0 = v128_shuflr32( V0 ); \
V3 = v128_swap64( V3 ); \
V2 = v128_shufll32( V2 ); \
}
// Default is 14 rounds, blakecoin & vanilla are 8.
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1, int rounds )
{
__m128i V0, V1, V2, V3;
v128_t V0, V1, V2, V3;
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V0 = casti_m128i( H, 0 );
V1 = casti_m128i( H, 1 );
V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
V0 = casti_v128( H, 0 );
V1 = casti_v128( H, 1 );
V2 = v128_set_32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
V3 = v128_set_32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
M0 = buf[ 0];
M1 = buf[ 1];
@@ -361,8 +361,8 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
}
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V0, V2 ) );
casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V1, V3 ) );
}
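The feed-forward drops the three-input XOR helper in favour of nested two-input XORs; the result is identical because XOR is associative:
// xor3( a, b, c ) == a ^ ( b ^ c ) == ( a ^ b ) ^ c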
////////////////////////////////////////////
@@ -371,16 +371,16 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
_mm_xor_si128( v128_32( c1 ), m0 ) ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
_mm_xor_si128( v128_32( c0 ), m1 ) ); \
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
a = v128_add32( v128_add32( a, b ), \
v128_xor( v128_32( c1 ), m0 ) ); \
d = v128_swap32_16( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 12 ); \
a = v128_add32( v128_add32( a, b ), \
v128_xor( v128_32( c0 ), m1 ) ); \
d = v128_shuflr32_8( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 7 ); \
}
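GS_4WAY is the BLAKE-256 G function evaluated on four lanes at once, with the standard rotation counts 16, 12, 8, 7 (v128_swap32_16 and v128_shuflr32_8 are the 16- and 8-bit rotates). For reference, the scalar G under the same operand names:
#define ROTR32( x, n )  ( ( (x) >> (n) ) | ( (x) << ( 32 - (n) ) ) )
#define G256( a, b, c, d, m0, m1, c0, c1 ) \
{ \
   a += b + ( (m0) ^ (c1) );   d = ROTR32( d ^ a, 16 ); \
   c += d;                     b = ROTR32( b ^ c, 12 ); \
   a += b + ( (m1) ^ (c0) );   d = ROTR32( d ^ a,  8 ); \
   c += d;                     b = ROTR32( b ^ c,  7 ); \
}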
#define ROUND_S_4WAY(r) \
@@ -396,31 +396,31 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
}
#define DECL_STATE32_4WAY \
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
v128_t H0, H1, H2, H3, H4, H5, H6, H7; \
uint32_t T0, T1;
#define READ_STATE32_4WAY(state) do { \
H0 = casti_m128i( state->H, 0 ); \
H1 = casti_m128i( state->H, 1 ); \
H2 = casti_m128i( state->H, 2 ); \
H3 = casti_m128i( state->H, 3 ); \
H4 = casti_m128i( state->H, 4 ); \
H5 = casti_m128i( state->H, 5 ); \
H6 = casti_m128i( state->H, 6 ); \
H7 = casti_m128i( state->H, 7 ); \
H0 = casti_v128( state->H, 0 ); \
H1 = casti_v128( state->H, 1 ); \
H2 = casti_v128( state->H, 2 ); \
H3 = casti_v128( state->H, 3 ); \
H4 = casti_v128( state->H, 4 ); \
H5 = casti_v128( state->H, 5 ); \
H6 = casti_v128( state->H, 6 ); \
H7 = casti_v128( state->H, 7 ); \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE32_4WAY(state) do { \
casti_m128i( state->H, 0 ) = H0; \
casti_m128i( state->H, 1 ) = H1; \
casti_m128i( state->H, 2 ) = H2; \
casti_m128i( state->H, 3 ) = H3; \
casti_m128i( state->H, 4 ) = H4; \
casti_m128i( state->H, 5 ) = H5; \
casti_m128i( state->H, 6 ) = H6; \
casti_m128i( state->H, 7 ) = H7; \
casti_v128( state->H, 0 ) = H0; \
casti_v128( state->H, 1 ) = H1; \
casti_v128( state->H, 2 ) = H2; \
casti_v128( state->H, 3 ) = H3; \
casti_v128( state->H, 4 ) = H4; \
casti_v128( state->H, 5 ) = H5; \
casti_v128( state->H, 6 ) = H6; \
casti_v128( state->H, 7 ) = H7; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
@@ -430,7 +430,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
{ \
__m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
v128_t shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
@@ -454,32 +454,32 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
{ \
M0 = mm128_bswap_32( buf[0] ); \
M1 = mm128_bswap_32( buf[1] ); \
M2 = mm128_bswap_32( buf[2] ); \
M3 = mm128_bswap_32( buf[3] ); \
M4 = mm128_bswap_32( buf[4] ); \
M5 = mm128_bswap_32( buf[5] ); \
M6 = mm128_bswap_32( buf[6] ); \
M7 = mm128_bswap_32( buf[7] ); \
M8 = mm128_bswap_32( buf[8] ); \
M9 = mm128_bswap_32( buf[9] ); \
MA = mm128_bswap_32( buf[10] ); \
MB = mm128_bswap_32( buf[11] ); \
MC = mm128_bswap_32( buf[12] ); \
MD = mm128_bswap_32( buf[13] ); \
ME = mm128_bswap_32( buf[14] ); \
MF = mm128_bswap_32( buf[15] ); \
M0 = v128_bswap32( buf[0] ); \
M1 = v128_bswap32( buf[1] ); \
M2 = v128_bswap32( buf[2] ); \
M3 = v128_bswap32( buf[3] ); \
M4 = v128_bswap32( buf[4] ); \
M5 = v128_bswap32( buf[5] ); \
M6 = v128_bswap32( buf[6] ); \
M7 = v128_bswap32( buf[7] ); \
M8 = v128_bswap32( buf[8] ); \
M9 = v128_bswap32( buf[9] ); \
MA = v128_bswap32( buf[10] ); \
MB = v128_bswap32( buf[11] ); \
MC = v128_bswap32( buf[12] ); \
MD = v128_bswap32( buf[13] ); \
ME = v128_bswap32( buf[14] ); \
MF = v128_bswap32( buf[15] ); \
}
#endif // SSSE3 else SSE2
#define COMPRESS32_4WAY( rounds ) \
{ \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
v128_t M8, M9, MA, MB, MC, MD, ME, MF; \
v128_t V0, V1, V2, V3, V4, V5, V6, V7; \
v128_t V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -514,14 +514,14 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
H0 = v128_xor( v128_xor( V8, V0 ), H0 ); \
H1 = v128_xor( v128_xor( V9, V1 ), H1 ); \
H2 = v128_xor( v128_xor( VA, V2 ), H2 ); \
H3 = v128_xor( v128_xor( VB, V3 ), H3 ); \
H4 = v128_xor( v128_xor( VC, V4 ), H4 ); \
H5 = v128_xor( v128_xor( VD, V5 ), H5 ); \
H6 = v128_xor( v128_xor( VE, V6 ), H6 ); \
H7 = v128_xor( v128_xor( VF, V7 ), H7 ); \
}
#if defined (__AVX2__)
@@ -1867,14 +1867,14 @@ static void
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
casti_m128i( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_m128i( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
casti_m128i( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 );
casti_m128i( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A );
casti_m128i( ctx->H, 4 ) = v128_64( 0x510E527F510E527F );
casti_m128i( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C );
casti_m128i( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB );
casti_m128i( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 );
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
casti_v128( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 );
casti_v128( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A );
casti_v128( ctx->H, 4 ) = v128_64( 0x510E527F510E527F );
casti_v128( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C );
casti_v128( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB );
casti_v128( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 );
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
@@ -1884,7 +1884,7 @@ static void
blake32_4way( blake_4way_small_context *ctx, const void *data,
size_t len )
{
__m128i *buf = (__m128i*)ctx->buf;
v128_t *buf = (v128_t*)ctx->buf;
size_t bptr = ctx->ptr<<2;
size_t vptr = ctx->ptr >> 2;
size_t blen = len << 2;
@@ -1925,7 +1925,7 @@ static void
blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m128i buf[16] __attribute__ ((aligned (64)));
v128_t buf[16] __attribute__ ((aligned (64)));
size_t ptr = ctx->ptr;
size_t vptr = ctx->ptr>>2;
unsigned bit_len = ( (unsigned)ptr << 3 );
@@ -1949,26 +1949,26 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
if ( vptr < 12 )
{
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
v128_memset_zero( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
buf[ 14 ] = v128_32( bswap_32( th ) );
buf[ 15 ] = v128_32( bswap_32( tl ) );
blake32_4way( ctx, buf + vptr, 64 - ptr );
}
else
{
memset_zero_128( buf + vptr + 1, (60-ptr) >> 2 );
v128_memset_zero( buf + vptr + 1, (60-ptr) >> 2 );
blake32_4way( ctx, buf + vptr, 64 - ptr );
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
v128_memset_zero( buf, 56>>2 );
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
buf[ 14 ] = v128_32( bswap_32( th ) );
buf[ 15 ] = v128_32( bswap_32( tl ) );
blake32_4way( ctx, buf, 64 );
}
mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H );
v128_block_bswap32( (v128_t*)dst, (v128_t*)ctx->H );
}
#if defined (__AVX2__)

View File

@@ -138,7 +138,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm128_bswap32_80( endiandata, pdata );
v128_bswap32_80( endiandata, pdata );
do {
endiandata[19] = n;

View File

@@ -12,13 +12,13 @@
*/
#include "blake2s-hash.h"
#include "simd-utils.h"
#include <stdint.h>
#include <string.h>
#include <stdio.h>
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
/*
static const uint32_t blake2s_IV[8] =
@@ -78,43 +78,43 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) );
S->h[i] = v128_xor( S->h[i], v128_32( p[i] ) );
return 0;
}
int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
int blake2s_4way_compress( blake2s_4way_state *S, const v128_t* block )
{
__m128i m[16];
__m128i v[16];
v128_t m[16];
v128_t v[16];
memcpy_128( m, block, 16 );
memcpy_128( v, S->h, 8 );
v128_memcpy( m, block, 16 );
v128_memcpy( v, S->h, 8 );
v[ 8] = v128_64( 0x6A09E6676A09E667ULL );
v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL );
v[10] = v128_64( 0x3C6EF3723C6EF372ULL );
v[11] = v128_64( 0xA54FF53AA54FF53AULL );
v[12] = _mm_xor_si128( v128_32( S->t[0] ),
v[12] = v128_xor( v128_32( S->t[0] ),
v128_64( 0x510E527F510E527FULL ) );
v[13] = _mm_xor_si128( v128_32( S->t[1] ),
v[13] = v128_xor( v128_32( S->t[1] ),
v128_64( 0x9B05688C9B05688CULL ) );
v[14] = _mm_xor_si128( v128_32( S->f[0] ),
v[14] = v128_xor( v128_32( S->f[0] ),
v128_64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm_xor_si128( v128_32( S->f[1] ),
v[15] = v128_xor( v128_32( S->f[1] ),
v128_64( 0x5BE0CD195BE0CD19ULL ) );
#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
a = v128_add32( v128_add32( a, b ), m[ s0 ] ); \
d = v128_swap32_16( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 12 ); \
a = v128_add32( v128_add32( a, b ), m[ s1 ] ); \
d = v128_shuflr32_8( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 7 ); \
} while(0)
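G4W is BLAKE2s's G across four lanes; it uses the same 16/12/8/7 rotation set as BLAKE-256 but adds the message words directly instead of XORing them with round constants. Scalar form for comparison:
#define ROTR32( x, n )  ( ( (x) >> (n) ) | ( (x) << ( 32 - (n) ) ) )
#define G2S( a, b, c, d, mx, my ) \
{ \
   a += b + (mx);   d = ROTR32( d ^ a, 16 ); \
   c += d;          b = ROTR32( b ^ c, 12 ); \
   a += b + (my);   d = ROTR32( d ^ a,  8 ); \
   c += d;          b = ROTR32( b ^ c,  7 ); \
}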
@@ -143,7 +143,7 @@ do { \
ROUND4W( 9 );
for( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] );
S->h[i] = v128_xor( v128_xor( S->h[i], v[i] ), v[i + 8] );
#undef G4W
#undef ROUND4W
@@ -175,26 +175,26 @@ do { \
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
uint64_t inlen )
{
__m128i *input = (__m128i*)in;
__m128i *buf = (__m128i*)S->buf;
v128_t *input = (v128_t*)in;
v128_t *buf = (v128_t*)S->buf;
while( inlen > 0 )
{
size_t left = S->buflen;
if( inlen >= BLAKE2S_BLOCKBYTES - left )
if( inlen >= 64 - left )
{
memcpy_128( buf + (left>>2), input, (BLAKE2S_BLOCKBYTES - left) >> 2 );
S->buflen += BLAKE2S_BLOCKBYTES - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
v128_memcpy( buf + (left>>2), input, (64 - left) >> 2 );
S->buflen += 64 - left;
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_4way_compress( S, buf );
S->buflen = 0;
input += ( BLAKE2S_BLOCKBYTES >> 2 );
inlen -= BLAKE2S_BLOCKBYTES;
input += ( 64 >> 2 );
inlen -= 64;
}
else
{
memcpy_128( buf + ( left>>2 ), input, inlen>>2 );
v128_memcpy( buf + ( left>>2 ), input, inlen>>2 );
S->buflen += (size_t) inlen;
input += ( inlen>>2 );
inlen -= inlen;
@@ -205,7 +205,7 @@ int blake2s_4way_update( blake2s_4way_state *S, const void *in,
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
{
__m128i *buf = (__m128i*)S->buf;
v128_t *buf = (v128_t*)S->buf;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
@@ -213,12 +213,12 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_128( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
v128_memset_zero( buf + ( S->buflen>>2 ),
( 64 - S->buflen ) >> 2 );
blake2s_4way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m128i( out, i ) = S->h[ i ];
casti_v128( out, i ) = S->h[ i ];
return 0;
}
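The literal 64 now stands in for BLAKE2S_BLOCKBYTES. The t[0]/t[1] pair is the 64-bit byte counter split into two 32-bit words, and the comparison after the addition is the usual carry-propagation idiom:
static inline void blake2s_counter_add( uint32_t t[2], uint32_t n )
{
   t[0] += n;
   t[1] += ( t[0] < n );   // t[0] wrapped exactly when the new low word is below n
}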
@@ -226,24 +226,24 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
const void *input, uint64_t inlen )
{
__m128i *in = (__m128i*)input;
__m128i *buf = (__m128i*)S->buf;
v128_t *in = (v128_t*)input;
v128_t *buf = (v128_t*)S->buf;
while( inlen > BLAKE2S_BLOCKBYTES )
while( inlen > 64 )
{
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
inlen -= BLAKE2S_BLOCKBYTES;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
v128_memcpy( buf, in, 64 >> 2 );
S->buflen = 64;
inlen -= 64;
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_4way_compress( S, buf );
S->buflen = 0;
in += ( BLAKE2S_BLOCKBYTES >> 2 );
in += ( 64 >> 2 );
}
// last block
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
v128_memcpy( buf, in, 64 >> 2 );
S->buflen = 64;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node ) S->f[1] = ~0U;
@@ -251,7 +251,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
blake2s_4way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m128i( out, i ) = S->h[ i ];
casti_v128( out, i ) = S->h[ i ];
return 0;
}
@@ -417,7 +417,7 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in,
{
__m256i *input = (__m256i*)in;
__m256i *buf = (__m256i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
const int bsize = 64;
while( inlen > 0 )
{
@@ -426,8 +426,8 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in,
{
memcpy_256( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_8way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
@@ -454,8 +454,7 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_256( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
memset_zero_256( buf + ( S->buflen>>2 ),( 64 - S->buflen ) >> 2 );
blake2s_8way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
@@ -470,21 +469,21 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
__m256i *in = (__m256i*)input;
__m256i *buf = (__m256i*)S->buf;
while( inlen > BLAKE2S_BLOCKBYTES )
while( inlen > 64 )
{
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
inlen -= BLAKE2S_BLOCKBYTES;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
memcpy_256( buf, in, 64 >> 2 );
S->buflen = 64;
inlen -= 64;
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_8way_compress( S, buf );
S->buflen = 0;
in += ( BLAKE2S_BLOCKBYTES >> 2 );
in += ( 64 >> 2 );
}
// last block
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
memcpy_256( buf, in, 64 >> 2 );
S->buflen = 64;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node ) S->f[1] = ~0U;
@@ -611,7 +610,7 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
{
__m512i *input = (__m512i*)in;
__m512i *buf = (__m512i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
const int bsize = 64;
while( inlen > 0 )
{
@@ -620,8 +619,8 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
{
memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_16way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
@@ -649,7 +648,7 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )
S->f[0] = ~0U;
memset_zero_512( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
( 64 - S->buflen ) >> 2 );
blake2s_16way_compress( S, buf );
for ( int i = 0; i < 8; ++i )

View File

@@ -14,7 +14,7 @@
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
#include "simd-utils.h"
@@ -29,41 +29,25 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
#pragma pack(push, 1)
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
} blake2s_nway_param;
#pragma pack(pop)
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct ALIGN( 64 ) __blake2s_4way_state
{
__m128i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
v128_t h[8];
uint8_t buf[ 64 * 4 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -83,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
uint8_t buf[ 32 * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -104,7 +88,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_16way_state
{
__m512i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
uint8_t buf[ 32 * 16 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -127,10 +111,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
#endif
#if defined(__cplusplus)
}
#endif
#endif // __SSE2__
#endif

View File

@@ -20,7 +20,7 @@ void blake2s_16way_hash( void *output, const void *input )
blake2s_16way_state ctx;
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
blake2s_16way_update( &ctx, input + (64<<4), 16 );
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
blake2s_16way_final( &ctx, output, 32 );
}
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
@@ -39,7 +39,7 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_init( &blake2s_16w_ctx, 32 );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
@@ -76,7 +76,7 @@ void blake2s_8way_hash( void *output, const void *input )
blake2s_8way_state ctx;
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
blake2s_8way_update( &ctx, input + (64<<3), 16 );
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
blake2s_8way_final( &ctx, output, 32 );
}
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
@@ -95,7 +95,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_init( &blake2s_8w_ctx, 32 );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
@@ -131,7 +131,7 @@ void blake2s_4way_hash( void *output, const void *input )
blake2s_4way_state ctx;
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
blake2s_4way_final( &ctx, output, 32 );
}
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
@@ -149,8 +149,8 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, 32 );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
@@ -183,12 +183,12 @@ static __thread blake2s_state blake2s_ctx;
void blake2s_hash( void *output, const void *input )
{
unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES];
unsigned char _ALIGN(32) hash[32];
blake2s_state ctx __attribute__ ((aligned (32)));
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
blake2s_update( &ctx, input+64, 16 );
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
blake2s_final( &ctx, hash, 32 );
memcpy(output, hash, 32);
}
@@ -201,14 +201,13 @@ int scanhash_blake2s( struct work *work,uint32_t max_nonce,
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) endiandata[20];
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm128_bswap32_80( endiandata, pdata );
v128_bswap32_80( endiandata, pdata );
// midstate
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
blake2s_init( &blake2s_ctx, 32 );
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
do

View File

@@ -343,52 +343,52 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
#define BLAKE512_G( r, Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( CBx( r, Sd ) ^ Mx( r, Sc ), \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 25 ); \
Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( CBx( r, Sc ) ^ Mx( r, Sd ), \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 11 ); \
Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
}
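BLAKE512_G packs two scalar G evaluations into each 128-bit register; together with the explicit 25- and 11-bit rotates, v128_swap64_32 (rotate 32) and v128_shuflr64_16 (rotate 16) give the standard BLAKE-512 rotation set 32, 25, 16, 11. Scalar reference, where mx and my are the (message XOR constant) operands:
#define ROTR64( x, n )  ( ( (x) >> (n) ) | ( (x) << ( 64 - (n) ) ) )
#define G512( a, b, c, d, mx, my ) \
{ \
   a += b + (mx);   d = ROTR64( d ^ a, 32 ); \
   c += d;          b = ROTR64( b ^ c, 25 ); \
   a += b + (my);   d = ROTR64( d ^ a, 16 ); \
   c += d;          b = ROTR64( b ^ c, 11 ); \
}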
#define BLAKE512_ROUND( R ) \
{ \
__m128i V32, V23, V67, V76; \
v128_t V32, V23, V67, V76; \
BLAKE512_G( R, V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE512_G( R, V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V32 = mm128_alignr_64( V[3], V[2], 1 ); \
V23 = mm128_alignr_64( V[2], V[3], 1 ); \
V67 = mm128_alignr_64( V[6], V[7], 1 ); \
V76 = mm128_alignr_64( V[7], V[6], 1 ); \
V32 = v128_alignr64( V[3], V[2], 1 ); \
V23 = v128_alignr64( V[2], V[3], 1 ); \
V67 = v128_alignr64( V[6], V[7], 1 ); \
V76 = v128_alignr64( V[7], V[6], 1 ); \
BLAKE512_G( R, V[0], V32, V[5], V67, 8, 9, A, B ); \
BLAKE512_G( R, V[1], V23, V[4], V76, C, D, E, F ); \
V[2] = mm128_alignr_64( V32, V23, 1 ); \
V[3] = mm128_alignr_64( V23, V32, 1 ); \
V[6] = mm128_alignr_64( V76, V67, 1 ); \
V[7] = mm128_alignr_64( V67, V76, 1 ); \
V[2] = v128_alignr64( V32, V23, 1 ); \
V[3] = v128_alignr64( V23, V32, 1 ); \
V[6] = v128_alignr64( V76, V67, 1 ); \
V[7] = v128_alignr64( V67, V76, 1 ); \
}
void blake512_transform( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 )
{
__m128i V[8];
v128_t V[8];
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V[0] = casti_m128i( H, 0 );
V[1] = casti_m128i( H, 1 );
V[2] = casti_m128i( H, 2 );
V[3] = casti_m128i( H, 3 );
V[4] = _mm_set_epi64x( CB1, CB0 );
V[5] = _mm_set_epi64x( CB3, CB2 );
V[6] = _mm_set_epi64x( T0 ^ CB5, T0 ^ CB4 );
V[7] = _mm_set_epi64x( T1 ^ CB7, T1 ^ CB6 );
V[0] = casti_v128( H, 0 );
V[1] = casti_v128( H, 1 );
V[2] = casti_v128( H, 2 );
V[3] = casti_v128( H, 3 );
V[4] = v128_set_64( CB1, CB0 );
V[5] = v128_set_64( CB3, CB2 );
V[6] = v128_set_64( T0 ^ CB5, T0 ^ CB4 );
V[7] = v128_set_64( T1 ^ CB7, T1 ^ CB6 );
M0 = bswap_64( buf[ 0] );
M1 = bswap_64( buf[ 1] );
@@ -424,10 +424,10 @@ void blake512_transform( uint64_t *H, const uint64_t *buf,
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V[0], V[4] );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V[1], V[5] );
casti_m128i( H, 2 ) = mm128_xor3( casti_m128i( H, 2 ), V[2], V[6] );
casti_m128i( H, 3 ) = mm128_xor3( casti_m128i( H, 3 ), V[3], V[7] );
casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V[0], V[4] ) );
casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V[1], V[5] ) );
casti_v128( H, 2 ) = v128_xor( casti_v128( H, 2 ), v128_xor( V[2], V[6] ) );
casti_v128( H, 3 ) = v128_xor( casti_v128( H, 3 ), v128_xor( V[3], V[7] ) );
}
#endif
@@ -611,7 +611,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
VD = v512_64( T0 ^ CB5 ); \
VE = v512_64( T1 ^ CB6 ); \
VF = v512_64( T1 ^ CB7 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
@@ -679,7 +679,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
VE = v512_64( sc->T1 ^ CB6 );
VF = v512_64( sc->T1 ^ CB7 );
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
@@ -1347,7 +1347,7 @@ blake512_8way_close(void *cc, void *dst)
VD = v256_64( T0 ^ CB5 ); \
VE = v256_64( T1 ^ CB6 ); \
VF = v256_64( T1 ^ CB7 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
@@ -1419,7 +1419,7 @@ void blake512_4way_compress( blake_4way_big_context *sc )
v256_64( CB6 ) );
VF = _mm256_xor_si256( v256_64( sc->T1 ),
v256_64( CB7 ) );
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );

View File

@@ -177,7 +177,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
HTarget = 0x7f;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );

View File

@@ -118,15 +118,15 @@ static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_
return 0;
}
static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] )
static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[8] )
{
memcpy( P->salt, salt, BLAKE2S_SALTBYTES );
memcpy( P->salt, salt, 8 );
return 0;
}
static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] )
static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[8] )
{
memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES );
memcpy( P->personal, personal, 8 );
return 0;
}
@@ -159,7 +159,7 @@ int blake2s_init( blake2s_state *S, const uint8_t outlen )
blake2s_param P[1];
/* Move interval verification here? */
if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
if ( ( !outlen ) || ( outlen > 32 ) ) return -1;
P->digest_length = outlen;
P->key_length = 0;
@@ -179,9 +179,9 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
{
blake2s_param P[1];
if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
if ( ( !outlen ) || ( outlen > 32 ) ) return -1;
if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;
if ( !key || !keylen || keylen > 8 ) return -1;
P->digest_length = outlen;
P->key_length = keylen;
@@ -198,16 +198,16 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
if( blake2s_init_param( S, P ) < 0 ) return -1;
{
uint8_t block[BLAKE2S_BLOCKBYTES];
memset( block, 0, BLAKE2S_BLOCKBYTES );
uint8_t block[64];
memset( block, 0, 64 );
memcpy( block, key, keylen );
blake2s_update( S, block, BLAKE2S_BLOCKBYTES );
secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
blake2s_update( S, block, 64 );
secure_zero_memory( block, 64 ); /* Burn the key from stack */
}
return 0;
}
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
{
uint32_t _ALIGN(32) m[16];
uint32_t _ALIGN(32) v[16];
@@ -329,16 +329,16 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )
while( inlen > 0 )
{
size_t left = S->buflen;
size_t fill = 2 * BLAKE2S_BLOCKBYTES - left;
size_t fill = 2 * 64 - left;
if( inlen > fill )
{
memcpy( S->buf + left, in, fill ); // Fill buffer
S->buflen += fill;
blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
blake2s_increment_counter( S, 64 );
blake2s_compress( S, S->buf ); // Compress
memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left
S->buflen -= BLAKE2S_BLOCKBYTES;
memcpy( S->buf, S->buf + 64, 64 ); // Shift buffer left
S->buflen -= 64;
in += fill;
inlen -= fill;
}
@@ -356,19 +356,19 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )
int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
{
uint8_t buffer[BLAKE2S_OUTBYTES];
uint8_t buffer[32];
if( S->buflen > BLAKE2S_BLOCKBYTES )
if( S->buflen > 64 )
{
blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
blake2s_increment_counter( S, 64 );
blake2s_compress( S, S->buf );
S->buflen -= BLAKE2S_BLOCKBYTES;
memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen );
S->buflen -= 64;
memcpy( S->buf, S->buf + 64, S->buflen );
}
blake2s_increment_counter( S, ( uint32_t )S->buflen );
blake2s_set_lastblock( S );
memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
memset( S->buf + S->buflen, 0, 2 * 64 - S->buflen ); /* Padding */
blake2s_compress( S, S->buf );
for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
@@ -408,10 +408,10 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen
#include "blake2-kat.h" /* test data not included */
int main( int argc, char **argv )
{
uint8_t key[BLAKE2S_KEYBYTES];
uint8_t key[8];
uint8_t buf[KAT_LENGTH];
for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i )
for( size_t i = 0; i < 8; ++i )
key[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
@@ -419,10 +419,10 @@ int main( int argc, char **argv )
for( size_t i = 0; i < KAT_LENGTH; ++i )
{
uint8_t hash[BLAKE2S_OUTBYTES];
blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
uint8_t hash[32];
blake2s( hash, buf, key, 32, i, 8 );
if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
if( 0 != memcmp( hash, blake2s_keyed_kat[i], 32 ) )
{
puts( "error" );
return -1;

View File

@@ -87,19 +87,6 @@ static inline void secure_zero_memory(void *v, size_t n)
/* blake2.h */
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
#pragma pack(push, 1)
typedef struct __blake2s_param
{
@@ -112,8 +99,8 @@ extern "C" {
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_param;
typedef struct ALIGN( 64 ) __blake2s_state
@@ -121,13 +108,13 @@ extern "C" {
uint32_t h[8];
uint32_t t[2];
uint32_t f[2];
uint8_t buf[2 * BLAKE2S_BLOCKBYTES];
uint8_t buf[2 * 64];
size_t buflen;
uint8_t last_node;
} blake2s_state ;
#pragma pack(pop)
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] );
int blake2s_compress( blake2s_state *S, const uint8_t block[64] );
// Streaming API
int blake2s_init( blake2s_state *S, const uint8_t outlen );

View File

@@ -95,6 +95,43 @@
}
*/
#elif defined(__SSE2__) || defined(__NEON__) // ready for NEON
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_shuflr64_24( v128_xor( Vb, Vc ) ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m128i *V = (__m128i*)v; \
__m128i V2, V3, V6, V7; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = v128_alignr64( V[3], V[2], 1 ); \
V3 = v128_alignr64( V[2], V[3], 1 ); \
V6 = v128_alignr64( V[6], V[7], 1 ); \
V7 = v128_alignr64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = v128_alignr64( V2, V3, 1 ); \
V[3] = v128_alignr64( V3, V2, 1 ); \
V[6] = v128_alignr64( V7, V6, 1 ); \
V[7] = v128_alignr64( V6, V7, 1 ); \
}
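The newly added BLAKE2B_G spells BLAKE2b's G with the v128 helpers: v128_swap64_32, v128_shuflr64_24 and v128_shuflr64_16 are the 32-, 24- and 16-bit right rotates, and the final rotate by 63 is written out. Scalar reference:
#define ROTR64( x, n )  ( ( (x) >> (n) ) | ( (x) << ( 64 - (n) ) ) )
#define G2B( a, b, c, d, x, y ) \
{ \
   a += b + (x);   d = ROTR64( d ^ a, 32 ); \
   c += d;         b = ROTR64( b ^ c, 24 ); \
   a += b + (y);   d = ROTR64( d ^ a, 16 ); \
   c += d;         b = ROTR64( b ^ c, 63 ); \
}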
/*
#elif defined(__SSE2__)
// always true
@@ -131,6 +168,7 @@
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
*/
#else
// never used, SSE2 is always available

View File

@@ -1,13 +1,6 @@
/* CubeHash 16/32 is recommended for SHA-3 "normal", 16/1 for "formal" */
#define CUBEHASH_ROUNDS 16
#define CUBEHASH_BLOCKBYTES 32
#define OPTIMIZE_SSE2
#if defined(OPTIMIZE_SSE2)
#include <emmintrin.h>
#endif
#ifdef __AVX2__
#include <immintrin.h>
#endif
#include "cubehash_sse2.h"
#include <stdbool.h>
#include <unistd.h>
@@ -80,70 +73,73 @@ static void transform( cubehashParam *sp )
_mm256_store_si256( (__m256i*)sp->x + 2, x2 );
_mm256_store_si256( (__m256i*)sp->x + 3, x3 );
#else
__m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
#else // AVX, SSE2, NEON
x0 = _mm_load_si128( (__m128i*)sp->x );
x1 = _mm_load_si128( (__m128i*)sp->x + 1 );
x2 = _mm_load_si128( (__m128i*)sp->x + 2 );
x3 = _mm_load_si128( (__m128i*)sp->x + 3 );
x4 = _mm_load_si128( (__m128i*)sp->x + 4 );
x5 = _mm_load_si128( (__m128i*)sp->x + 5 );
x6 = _mm_load_si128( (__m128i*)sp->x + 6 );
x7 = _mm_load_si128( (__m128i*)sp->x + 7 );
#pragma message "NEON for Cubehash"
v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
x0 = casti_v128( sp->x, 0 );
x1 = casti_v128( sp->x, 1 );
x2 = casti_v128( sp->x, 2 );
x3 = casti_v128( sp->x, 3 );
x4 = casti_v128( sp->x, 4 );
x5 = casti_v128( sp->x, 5 );
x6 = casti_v128( sp->x, 6 );
x7 = casti_v128( sp->x, 7 );
for ( r = 0; r < rounds; ++r )
{
x4 = _mm_add_epi32( x0, x4 );
x5 = _mm_add_epi32( x1, x5 );
x6 = _mm_add_epi32( x2, x6 );
x7 = _mm_add_epi32( x3, x7 );
x4 = v128_add32( x0, x4 );
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = mm128_rol_32( y0, 7 );
x1 = mm128_rol_32( y1, 7 );
x2 = mm128_rol_32( y2, 7 );
x3 = mm128_rol_32( y3, 7 );
x0 = _mm_xor_si128( x0, x4 );
x1 = _mm_xor_si128( x1, x5 );
x2 = _mm_xor_si128( x2, x6 );
x3 = _mm_xor_si128( x3, x7 );
x4 = _mm_shuffle_epi32( x4, 0x4e );
x5 = _mm_shuffle_epi32( x5, 0x4e );
x6 = _mm_shuffle_epi32( x6, 0x4e );
x7 = _mm_shuffle_epi32( x7, 0x4e );
x4 = _mm_add_epi32( x0, x4 );
x5 = _mm_add_epi32( x1, x5 );
x6 = _mm_add_epi32( x2, x6 );
x7 = _mm_add_epi32( x3, x7 );
x0 = v128_rol32( y0, 7 );
x1 = v128_rol32( y1, 7 );
x2 = v128_rol32( y2, 7 );
x3 = v128_rol32( y3, 7 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64( x4 );
x5 = v128_swap64( x5 );
x6 = v128_swap64( x6 );
x7 = v128_swap64( x7 );
x4 = v128_add32( x0, x4 );
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = mm128_rol_32( y0, 11 );
x1 = mm128_rol_32( y1, 11 );
x2 = mm128_rol_32( y2, 11 );
x3 = mm128_rol_32( y3, 11 );
x0 = _mm_xor_si128( x0, x4 );
x1 = _mm_xor_si128( x1, x5 );
x2 = _mm_xor_si128( x2, x6 );
x3 = _mm_xor_si128( x3, x7 );
x4 = _mm_shuffle_epi32( x4, 0xb1 );
x5 = _mm_shuffle_epi32( x5, 0xb1 );
x6 = _mm_shuffle_epi32( x6, 0xb1 );
x7 = _mm_shuffle_epi32( x7, 0xb1 );
x0 = v128_rol32( y0, 11 );
x1 = v128_rol32( y1, 11 );
x2 = v128_rol32( y2, 11 );
x3 = v128_rol32( y3, 11 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64_32( x4 );
x5 = v128_swap64_32( x5 );
x6 = v128_swap64_32( x6 );
x7 = v128_swap64_32( x7 );
}
_mm_store_si128( (__m128i*)sp->x, x0 );
_mm_store_si128( (__m128i*)sp->x + 1, x1 );
_mm_store_si128( (__m128i*)sp->x + 2, x2 );
_mm_store_si128( (__m128i*)sp->x + 3, x3 );
_mm_store_si128( (__m128i*)sp->x + 4, x4 );
_mm_store_si128( (__m128i*)sp->x + 5, x5 );
_mm_store_si128( (__m128i*)sp->x + 6, x6 );
_mm_store_si128( (__m128i*)sp->x + 7, x7 );
casti_v128( sp->x, 0 ) = x0;
casti_v128( sp->x, 1 ) = x1;
casti_v128( sp->x, 2 ) = x2;
casti_v128( sp->x, 3 ) = x3;
casti_v128( sp->x, 4 ) = x4;
casti_v128( sp->x, 5 ) = x5;
casti_v128( sp->x, 6 ) = x6;
casti_v128( sp->x, 7 ) = x7;
#endif
} // transform
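The CubeHash round replaces the _mm_shuffle_epi32 immediates with named helpers; both controls are plain lane permutations, which the v128 names describe directly:
// _mm_shuffle_epi32( x, 0x4e ) selects elements 2,3,0,1 -> swap the two 64-bit halves  (v128_swap64)
// _mm_shuffle_epi32( x, 0xb1 ) selects elements 1,0,3,2 -> swap the 32-bit words inside
//                                                          each 64-bit half            (v128_swap64_32)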
@@ -170,7 +166,7 @@ static const uint64_t IV512[] =
int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
{
__m128i *x = (__m128i*)sp->x;
v128_t *x = (v128_t*)sp->x;
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
@@ -179,34 +175,34 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
if ( hashbitlen == 512 )
{
x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
return SUCCESS;
return 0;
}
int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size )
int cubehashUpdate( cubehashParam *sp, const void *data, size_t size )
{
const int len = size / 16;
const __m128i* in = (__m128i*)data;
const v128_t* in = (v128_t*)data;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
@@ -214,7 +210,7 @@ int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size )
for ( i = 0; i < len; i++ )
{
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
@@ -223,20 +219,20 @@ int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size )
}
}
return SUCCESS;
return 0;
}
int cubehashDigest( cubehashParam *sp, byte *digest )
int cubehashDigest( cubehashParam *sp, void *digest )
{
__m128i* hash = (__m128i*)digest;
v128_t* hash = (v128_t*)digest;
int i;
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
_mm_set_epi64x( 0, 0x80 ) );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
v128_set_64( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
transform( sp );
transform( sp );
transform( sp );
@@ -251,15 +247,15 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->x[i];
return SUCCESS;
return 0;
}
int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
const byte *data, size_t size )
int cubehashUpdateDigest( cubehashParam *sp, void *digest,
const void *data, size_t size )
{
const int len = size / 16;
const __m128i* in = (__m128i*)data;
__m128i* hash = (__m128i*)digest;
const v128_t* in = (v128_t*)data;
v128_t* hash = (v128_t*)digest;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
@@ -267,7 +263,7 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
for ( i = 0; i < len; i++ )
{
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
@@ -277,11 +273,11 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
_mm_set_epi64x( 0, 0x80 ) );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
v128_set_64( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
transform( sp );
transform( sp );
@@ -297,13 +293,13 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->x[i];
return SUCCESS;
return 0;
}
int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
const byte *data, size_t size )
int cubehash_full( cubehashParam *sp, void *digest, int hashbitlen,
const void *data, size_t size )
{
__m128i *x = (__m128i*)sp->x;
v128_t *x = (v128_t*)sp->x;
sp->hashlen = hashbitlen/128;
sp->blocksize = 32/16;
sp->rounds = 16;
@@ -312,33 +308,33 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
if ( hashbitlen == 512 )
{
x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
const int len = size / 16;
const __m128i* in = (__m128i*)data;
__m128i* hash = (__m128i*)digest;
const v128_t* in = (v128_t*)data;
v128_t* hash = (v128_t*)digest;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
@@ -346,7 +342,7 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
for ( i = 0; i < len; i++ )
{
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
@@ -356,11 +352,11 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
_mm_set_epi64x( 0, 0x80 ) );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
v128_set_64( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
transform( sp );
transform( sp );
@@ -376,6 +372,6 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->x[i];
return SUCCESS;
return 0;
}
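For orientation, a minimal sketch of driving the reworked void*-based CubeHash API from caller code (assumes the cubehash header below is included; the 512-bit output length, 80-byte input and variable names are my own illustration, not part of this commit):

// Sketch only, not from the commit. One-shot CubeHash-512 over an 80-byte
// block header. Per the comments in the source, 'header80' should be 16-byte
// aligned and a multiple of 16 bytes long; hash64 receives 64 bytes.
void example_cube512( void *hash64, const void *header80 )
{
    cubehashParam ctx;
    // init for 512-bit output, absorb 80 bytes, emit digest in one call
    cubehash_full( &ctx, hash64, 512, header80, 80 );
}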
View File
@@ -3,11 +3,7 @@
#include "compat.h"
#include <stdint.h>
#include "compat/sha3-defs.h"
#define OPTIMIZE_SSE2
#include <emmintrin.h>
#include "simd-utils.h"
/*!\brief Holds all the parameters necessary for the CUBEHASH algorithm.
* \ingroup HASH_cubehash_m
@@ -15,7 +11,7 @@
struct _cubehashParam
{
__m128i _ALIGN(64) x[8]; // aligned for __m512i
v128_t _ALIGN(64) x[8]; // aligned for __m512i
int hashlen; // __m128i
int rounds;
int blocksize; // __m128i
@@ -32,15 +28,15 @@ int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);
// reinitialize context with same parameters, much faster.
int cubehashReinit( cubehashParam* sp );
int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);
int cubehashUpdate(cubehashParam* sp, const void *data, size_t size);
int cubehashDigest(cubehashParam* sp, byte *digest);
int cubehashDigest(cubehashParam* sp, void *digest);
int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data,
size_t size );
int cubehashUpdateDigest( cubehashParam *sp, void *digest,
const void *data, size_t size );
int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen,
const byte *data, size_t size );
int cubehash_full( cubehashParam* sp, void *digest, int hashbitlen,
const void *data, size_t size );
#ifdef __cplusplus
}
View File
@@ -13,6 +13,9 @@
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
//TODO NEON support, funky shuffles
#if defined(__AES__)
#include <memory.h>
View File
@@ -24,16 +24,16 @@
#include "compat/sha3_common.h"
#include <emmintrin.h>
#include "simd-utils.h"
typedef struct
{
__m128i state[4][4];
v128_t state[4][4];
BitSequence buffer[192];
__m128i k;
__m128i hashsize;
__m128i const1536;
v128_t k;
v128_t hashsize;
v128_t const1536;
unsigned int uRounds;
unsigned int uHashSize;
View File
@@ -9,13 +9,12 @@
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>
#include "simd-utils.h"
#define LENGTH (512)
@@ -67,8 +66,8 @@ typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr
#define SIZE512 (SIZE_1024/16)
typedef struct {
__attribute__ ((aligned (64))) __m128i chaining[SIZE512];
__attribute__ ((aligned (64))) __m128i buffer[SIZE512];
__attribute__ ((aligned (64))) v128_t chaining[SIZE512];
__attribute__ ((aligned (64))) v128_t buffer[SIZE512];
int hashlen; // byte
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
View File
@@ -9,7 +9,7 @@
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include "simd-utils.h"
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
@@ -91,8 +91,8 @@ typedef enum
#define SIZE256 (SIZE_512/16)
typedef struct {
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
__attribute__ ((aligned (32))) v128_t chaining[SIZE256];
__attribute__ ((aligned (32))) v128_t buffer[SIZE256];
int hashlen; // bytes
int blk_count;
int buf_ptr; /* data buffer pointer */
View File
@@ -10,7 +10,6 @@
#define GROESTL256_HASH_4WAY_H__ 1
#include "simd-utils.h"
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
View File
@@ -2,7 +2,6 @@
#define GROESTL512_HASH_4WAY_H__ 1
#include "simd-utils.h"
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
View File
@@ -211,7 +211,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
View File
@@ -41,7 +41,7 @@ static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
( haval_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
v128_t *vdata = (v128_t*)data;
unsigned current;
current = (unsigned)sc->count_low & 127U;
@@ -53,7 +53,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
clen = 128U - current;
if ( clen > len )
clen = len;
memcpy_128( sc->buf + (current>>2), vdata, clen>>2 );
v128_memcpy( sc->buf + (current>>2), vdata, clen>>2 );
vdata += clen>>2;
current += clen;
len -= clen;
@@ -88,7 +88,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
RSTATE;
if ( current > 116UL )
{
memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
v128_memset_zero( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
do
{
IN_PREPARE(sc->buf);
@@ -98,12 +98,12 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
}
uint32_t t1, t2;
memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
v128_memset_zero( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
t1 = 0x01 | (PASSES << 3);
t2 = sc->olen << 3;
sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3)
sc->buf[ 116>>2 ] = v128_32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = v128_32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = v128_32( (sc->count_high << 3)
| (sc->count_low >> 29) );
do
{
View File
@@ -38,11 +38,12 @@
#include <stddef.h>
#include <string.h>
#include <stdint.h>
#include "haval-hash-4way.h"
// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way.
//#if defined (__SSE4_2__)
#if defined(__AVX__)
#if defined(__AVX__) || defined(__ARM_NEON)
#ifdef __cplusplus
extern "C"{
@@ -55,97 +56,97 @@ extern "C"{
#if defined(__AVX512VL__)
// ( ~( a ^ b ) ) & c
#define mm128_andnotxor( a, b, c ) \
#define v128_andnotxor( a, b, c ) \
_mm_ternarylogic_epi32( a, b, c, 0x82 )
#else
#define mm128_andnotxor( a, b, c ) \
_mm_andnot_si128( _mm_xor_si128( a, b ), c )
#define v128_andnotxor( a, b, c ) \
v128_andnot( v128_xor( a, b ), c )
#endif
#define F1(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( x0, mm128_andxor( x1, x0, x4 ), \
_mm_xor_si128( _mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) ) \
v128_xor3( x0, v128_andxor( x1, x0, x4 ), \
v128_xor( v128_and( x2, x5 ), \
v128_and( x3, x6 ) ) ) \
#define F2(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( mm128_andxor( x2, _mm_andnot_si128( x3, x1 ), \
mm128_xor3( _mm_and_si128( x4, x5 ), x6, x0 ) ), \
mm128_andxor( x4, x1, x5 ), \
mm128_xorand( x0, x3, x5 ) ) \
v128_xor3( v128_andxor( x2, v128_andnot( x3, x1 ), \
v128_xor3( v128_and( x4, x5 ), x6, x0 ) ), \
v128_andxor( x4, x1, x5 ), \
v128_xorand( x0, x3, x5 ) ) \
#define F3(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( x0, \
_mm_and_si128( x3, \
mm128_xor3( _mm_and_si128( x1, x2 ), x6, x0 ) ), \
_mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ) )
v128_xor3( x0, \
v128_and( x3, \
v128_xor3( v128_and( x1, x2 ), x6, x0 ) ), \
v128_xor( v128_and( x1, x4 ), \
v128_and( x2, x5 ) ) )
#define F4(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( \
mm128_andxor( x3, x5, \
_mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_or_si128( x4, x6 ) ) ), \
_mm_and_si128( x4, \
mm128_xor3( x0, _mm_andnot_si128( x2, x5 ), \
_mm_xor_si128( x1, x6 ) ) ), \
mm128_xorand( x0, x2, x6 ) )
v128_xor3( \
v128_andxor( x3, x5, \
v128_xor( v128_and( x1, x2 ), \
v128_or( x4, x6 ) ) ), \
v128_and( x4, \
v128_xor3( x0, v128_andnot( x2, x5 ), \
v128_xor( x1, x6 ) ) ), \
v128_xorand( x0, x2, x6 ) )
#define F5(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
mm128_andnotxor( mm128_and3( x1, x2, x3 ), x5, x0 ), \
mm128_xor3( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) )
v128_xor( \
v128_andnotxor( v128_and3( x1, x2, x3 ), x5, x0 ), \
v128_xor3( v128_and( x1, x4 ), \
v128_and( x2, x5 ), \
v128_and( x3, x6 ) ) )
/*
#define F1(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( x0, \
_mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
_mm_xor_si128( _mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) ) ) \
v128_xor( x0, \
v128_xor( v128_and(v128_xor( x0, x4 ), x1 ), \
v128_xor( v128_and( x2, x5 ), \
v128_and( x3, x6 ) ) ) ) \
#define F2(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x2, \
_mm_xor_si128( _mm_andnot_si128( x3, x1 ), \
_mm_xor_si128( _mm_and_si128( x4, x5 ), \
_mm_xor_si128( x6, x0 ) ) ) ), \
_mm_xor_si128( \
_mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \
_mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) ) \
v128_xor( \
v128_and( x2, \
v128_xor( v128_andnot( x3, x1 ), \
v128_xor( v128_and( x4, x5 ), \
v128_xor( x6, x0 ) ) ) ), \
v128_xor( \
v128_and( x4, v128_xor( x1, x5 ) ), \
v128_xor( v128_and( x3, x5 ), x0 ) ) ) \
#define F3(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x3, \
_mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_xor_si128( x6, x0 ) ) ), \
_mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), x0 ) )
v128_xor( \
v128_and( x3, \
v128_xor( v128_and( x1, x2 ), \
v128_xor( x6, x0 ) ) ), \
v128_xor( v128_xor(v128_and( x1, x4 ), \
v128_and( x2, x5 ) ), x0 ) )
#define F4(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_xor_si128( \
_mm_and_si128( x3, \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_or_si128( x4, x6 ) ), x5 ) ), \
_mm_and_si128( x4, \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm128_not(x2), x5 ), \
_mm_xor_si128( x1, x6 ) ), x0 ) ) ), \
_mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) )
v128_xor( \
v128_xor( \
v128_and( x3, \
v128_xor( v128_xor( v128_and( x1, x2 ), \
v128_or( x4, x6 ) ), x5 ) ), \
v128_and( x4, \
v128_xor( v128_xor( v128_and( v128_not(x2), x5 ), \
v128_xor( x1, x6 ) ), x0 ) ) ), \
v128_xor( v128_and( x2, x6 ), x0 ) )
#define F5(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x0, \
mm128_not( _mm_xor_si128( \
_mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), \
_mm_and_si128( x3, x6 ) ) )
v128_xor( \
v128_and( x0, \
v128_not( v128_xor( \
v128_and( v128_and( x1, x2 ), x3 ), x5 ) ) ), \
v128_xor( v128_xor( v128_and( x1, x4 ), \
v128_and( x2, x5 ) ), \
v128_and( x3, x6 ) ) )
*/
/*
@@ -186,17 +187,17 @@ extern "C"{
*/
#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), \
_mm_add_epi32( w, v128_32( c ) ) ); \
v128_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = v128_add32( v128_add32( v128_ror32( t, 7 ), \
v128_ror32( x7, 11 ) ), \
v128_add32( w, v128_32( c ) ) ); \
} while (0)
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), w ); \
v128_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = v128_add32( v128_add32( v128_ror32( t, 7 ), \
v128_ror32( x7, 11 ) ), w ); \
} while (0)
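In scalar terms, each 32-bit lane in the STEP/STEP1 macros above performs the same update; a plain-C restatement (names and helper are mine, illustrative only):

#include <stdint.h>

static inline uint32_t ror32_scalar( uint32_t x, unsigned n )
{   return ( x >> n ) | ( x << ( 32 - n ) );   }

// One HAVAL step on a single lane: t = Fn_p( x6..x0 ) as computed above,
// w is the message word, c the round constant (STEP1 simply passes c = 0).
static inline uint32_t haval_step_scalar( uint32_t t, uint32_t x7,
                                          uint32_t w, uint32_t c )
{   return ror32_scalar( t, 7 ) + ror32_scalar( x7, 11 ) + w + c;   }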
/*
@@ -371,7 +372,7 @@ static const uint32_t RK5[32] = {
};
#define SAVE_STATE \
__m128i u0, u1, u2, u3, u4, u5, u6, u7; \
v128_t u0, u1, u2, u3, u4, u5, u6, u7; \
do { \
u0 = s0; \
u1 = s1; \
@@ -385,14 +386,14 @@ static const uint32_t RK5[32] = {
#define UPDATE_STATE \
do { \
s0 = _mm_add_epi32( s0, u0 ); \
s1 = _mm_add_epi32( s1, u1 ); \
s2 = _mm_add_epi32( s2, u2 ); \
s3 = _mm_add_epi32( s3, u3 ); \
s4 = _mm_add_epi32( s4, u4 ); \
s5 = _mm_add_epi32( s5, u5 ); \
s6 = _mm_add_epi32( s6, u6 ); \
s7 = _mm_add_epi32( s7, u7 ); \
s0 = v128_add32( s0, u0 ); \
s1 = v128_add32( s1, u1 ); \
s2 = v128_add32( s2, u2 ); \
s3 = v128_add32( s3, u3 ); \
s4 = v128_add32( s4, u4 ); \
s5 = v128_add32( s5, u5 ); \
s6 = v128_add32( s6, u6 ); \
s7 = v128_add32( s7, u7 ); \
} while (0)
/*
@@ -431,7 +432,7 @@ do { \
/*
* DSTATE declares the state variables "s0" to "s7".
*/
#define DSTATE __m128i s0, s1, s2, s3, s4, s5, s6, s7
#define DSTATE v128_t s0, s1, s2, s3, s4, s5, s6, s7
/*
* RSTATE fills the state variables from the context "sc".
@@ -486,7 +487,7 @@ haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
}
#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata)
#define IN_PREPARE(indata) const v128_t *const load_ptr = (indata)
#define INW(i) load_ptr[ i ]
@@ -497,7 +498,7 @@ haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
static void
haval_4way_out( haval_4way_context *sc, void *dst )
{
__m128i *buf = (__m128i*)dst;
v128_t *buf = (v128_t*)dst;
DSTATE;
RSTATE;
View File
@@ -61,7 +61,7 @@
#ifndef HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__ 1
#if defined(__AVX__)
#if defined(__AVX__) || defined(__ARM_NEON)
#ifdef __cplusplus
extern "C"{
@@ -73,8 +73,8 @@ extern "C"{
#define SPH_SIZE_haval256_5 256
typedef struct {
__m128i buf[32];
__m128i s0, s1, s2, s3, s4, s5, s6, s7;
v128_t buf[32];
v128_t s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
uint32_t count_high, count_low;
} haval_4way_context;
View File
@@ -1,10 +1,11 @@
#include <stdint.h>
#include <x86intrin.h>
#include "wolf-aes.h"
#include "miner.h"
#if defined(__AES__)
#include <x86intrin.h>
#include "wolf-aes.h"
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
__m128i tmp4;
View File
@@ -1,4 +1,5 @@
#include <memory.h>
#include <mm_malloc.h>
#include <stdlib.h>
#include "hodl-gate.h"
View File
@@ -1,7 +1,7 @@
#include <string.h>
#include <openssl/evp.h>
#include <openssl/sha.h>
#include <x86intrin.h>
#include "simd-utils.h"
#include "sha512-avx.h"
#include "wolf-aes.h"
#include "hodl-gate.h"
View File
@@ -2,7 +2,7 @@
#define __HODL_H
#include <stdint.h>
#include <x86intrin.h>
#include "simd-utils.h"
#include "miner.h"
#define AES_ITERATIONS 15
@@ -16,7 +16,7 @@
typedef union _CacheEntry
{
uint32_t dwords[GARBAGE_SLICE_SIZE >> 2] __attribute__((aligned(16)));
__m128i dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
v128_t dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
} CacheEntry;
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
View File
@@ -2,7 +2,7 @@
#define _SHA512_H
#include <stdint.h>
#include "emmintrin.h"
#include "simd-utils.h"
//SHA-512 block size
#define SHA512_BLOCK_SIZE 128
@@ -24,8 +24,8 @@ typedef struct
__m256i w[80];
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
__m128i h[8];
__m128i w[80];
v128_t h[8];
v128_t w[80];
#else
int dummy;
#endif
View File
@@ -2,9 +2,9 @@
#define __WOLF_AES_H
#include <stdint.h>
#include <x86intrin.h>
#include "simd-utils.h"
void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf);
void ExpandAESKey256(v128_t *keys, const v128_t *KeyBuf);
#if defined(__SSE4_2__)
//#ifdef __AVX__
@@ -12,13 +12,13 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf);
#define AES_PARALLEL_N 8
#define BLOCK_COUNT 256
void AES256CBC( __m128i** data, const __m128i** next, __m128i ExpandedKey[][16],
__m128i* IV );
void AES256CBC( v128_t** data, const v128_t** next, v128_t ExpandedKey[][16],
v128_t* IV );
#else
void AES256CBC( __m128i *Ciphertext, const __m128i *Plaintext,
const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount );
void AES256CBC( v128_t *Ciphertext, const v128_t *Plaintext,
const v128_t *ExpandedKey, v128_t IV, uint32_t BlockCount );
#endif
View File
@@ -8,10 +8,10 @@
void keccakhash_8way(void *state, const void *input)
{
keccak256_8way_context ctx;
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, input, 80 );
keccak256_8way_close( &ctx, state );
keccak256_8x64_context ctx;
keccak256_8x64_init( &ctx );
keccak256_8x64_update( &ctx, input, 80 );
keccak256_8x64_close( &ctx, state );
}
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
@@ -61,10 +61,10 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
void keccakhash_4way(void *state, const void *input)
{
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, input, 80 );
keccak256_4way_close( &ctx, state );
keccak256_4x64_context ctx;
keccak256_4x64_init( &ctx );
keccak256_4x64_update( &ctx, input, 80 );
keccak256_4x64_close( &ctx, state );
}
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
View File
@@ -207,30 +207,30 @@ void keccak256_8way_init( void *kc )
}
void
keccak256_8way_update(void *cc, const void *data, size_t len)
keccak256_8x64_update(void *cc, const void *data, size_t len)
{
keccak64_8way_core(cc, data, len, 136);
}
void
keccak256_8way_close(void *cc, void *dst)
keccak256_8x64_close(void *cc, void *dst)
{
keccak64_8way_close(cc, dst, 32, 136);
}
void keccak512_8way_init( void *kc )
void keccak512_8x64_init( void *kc )
{
keccak64_8way_init( kc, 512 );
}
void
keccak512_8way_update(void *cc, const void *data, size_t len)
keccak512_8x64_update(void *cc, const void *data, size_t len)
{
keccak64_8way_core(cc, data, len, 72);
}
void
keccak512_8way_close(void *cc, void *dst)
keccak512_8x64_close(void *cc, void *dst)
{
keccak64_8way_close(cc, dst, 64, 72);
}
@@ -395,24 +395,24 @@ void keccak256_4way_init( void *kc )
}
void
keccak256_4way_update(void *cc, const void *data, size_t len)
keccak256_4x64_update(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 136);
}
void
keccak256_4way_close(void *cc, void *dst)
keccak256_4x64_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 32, 136);
}
void keccak512_4way_init( void *kc )
void keccak512_4x64_init( void *kc )
{
keccak64_init( kc, 512 );
}
void
keccak512_4way_update(void *cc, const void *data, size_t len)
keccak512_4x64_update(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 72);
}
View File
@@ -1,64 +1,94 @@
#ifndef KECCAK_HASH_4WAY_H__
#define KECCAK_HASH_4WAY_H__
#ifdef __AVX2__
#include <stddef.h>
#include "simd-utils.h"
/**
* This structure is a context for Keccak computations: it contains the
* intermediate values and some data from the last entered block. Once a
* Keccak computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running Keccak computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
__m512i buf[144*8];
__m512i w[25];
size_t ptr, lim;
typedef struct
{
__m512i buf[144*8];
__m512i w[25];
size_t ptr, lim;
} keccak64_ctx_m512i __attribute__((aligned(128)));
typedef keccak64_ctx_m512i keccak256_8way_context;
typedef keccak64_ctx_m512i keccak512_8way_context;
typedef keccak64_ctx_m512i keccak256_8x64_context;
typedef keccak64_ctx_m512i keccak512_8x64_context;
void keccak256_8way_init(void *cc);
void keccak256_8way_update(void *cc, const void *data, size_t len);
void keccak256_8way_close(void *cc, void *dst);
void keccak256_8x64_init(void *cc);
void keccak256_8x64_update(void *cc, const void *data, size_t len);
void keccak256_8x64_close(void *cc, void *dst);
void keccak512_8way_init(void *cc);
void keccak512_8way_update(void *cc, const void *data, size_t len);
void keccak512_8way_close(void *cc, void *dst);
void keccak512_8way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void keccak512_8x64_init(void *cc);
void keccak512_8x64_update(void *cc, const void *data, size_t len);
void keccak512_8x64_close(void *cc, void *dst);
// legacy naming
#define keccak512_8way_context keccak512_8x64_context
#define keccak512_8way_init keccak512_8x64_init
#define keccak512_8way_update keccak512_8x64_update
#define keccak512_8way_close keccak512_8x64_close
#define keccak256_8way_context keccak256_8x64_context
#define keccak256_8way_init keccak256_8x64_init
#define keccak256_8way_update keccak256_8x64_update
#define keccak256_8way_close keccak256_8x64_close
#endif
typedef struct {
__m256i buf[144*8];
__m256i w[25];
size_t ptr, lim;
#if defined(__AVX2__)
typedef struct
{
__m256i buf[144*8];
__m256i w[25];
size_t ptr, lim;
} keccak64_ctx_m256i __attribute__((aligned(128)));
typedef keccak64_ctx_m256i keccak256_4way_context;
typedef keccak64_ctx_m256i keccak512_4way_context;
typedef keccak64_ctx_m256i keccak256_4x64_context;
typedef keccak64_ctx_m256i keccak512_4x64_context;
void keccak256_4way_init(void *cc);
void keccak256_4way_update(void *cc, const void *data, size_t len);
void keccak256_4way_close(void *cc, void *dst);
void keccak256_4x64_init(void *cc);
void keccak256_4x64_update(void *cc, const void *data, size_t len);
void keccak256_4x64_close(void *cc, void *dst);
void keccak512_4way_init(void *cc);
void keccak512_4way_update(void *cc, const void *data, size_t len);
void keccak512_4way_close(void *cc, void *dst);
void keccak512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void keccak512_4x64_init(void *cc);
void keccak512_4x64_update(void *cc, const void *data, size_t len);
void keccak512_4x64_close(void *cc, void *dst);
// legacy naming
#define keccak512_4way_context keccak512_4x64_context
#define keccak512_4way_init keccak512_4x64_init
#define keccak512_4way_update keccak512_4x64_update
#define keccak512_4way_close keccak512_4x64_close
#define keccak256_4way_context keccak256_4x64_context
#define keccak256_4way_init keccak256_4x64_init
#define keccak256_4way_update keccak256_4x64_update
#define keccak256_4way_close keccak256_4x64_close
#endif
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct
{
v128_t buf[144*4];
v128_t w[50];
size_t ptr, lim;
} keccak32_ctx_v128 __attribute__((aligned(64)));
typedef keccak32_ctx_v128 keccak256_4x32_context;
typedef keccak32_ctx_v128 keccak512_4x32_context;
void keccak256_4x32_init(void *cc);
void keccak256_4x32_update(void *cc, const void *data, size_t len);
void keccak256_4x32_close(void *cc, void *dst);
void keccak512_4x32_init(void *cc);
void keccak512_4x32_update(void *cc, const void *data, size_t len);
void keccak512_4x32_close(void *cc, void *dst);
#endif
#endif
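A minimal sketch of the renamed NxW interface, including the context-cloning-by-memcpy trick the header comment describes (assumes this header is included; the function choice, buffer layout and names are illustrative):

// Sketch only, not from the commit. Four interleaved lanes hashed with
// Keccak-512 4x64, with a running midstate cloned via plain memcpy.
#include <string.h>
#include <stddef.h>

void example_keccak512_4x64( void *out4, const void *in4, size_t len )
{
    keccak512_4x64_context ctx, saved;
    keccak512_4x64_init( &ctx );
    keccak512_4x64_update( &ctx, in4, len );
    memcpy( &saved, &ctx, sizeof ctx );   // clone a running computation
    keccak512_4x64_close( &ctx, out4 );   // 'saved' can absorb more data later
}

The legacy *_4way/*_8way names remain available through the #define aliases above, so existing callers compile unchanged.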
View File
@@ -11,13 +11,13 @@ void sha3d_hash_8way(void *state, const void *input)
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
keccak256_8way_context ctx;
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, input, 80 );
keccak256_8way_close( &ctx, buffer );
keccak256_8x64_init( &ctx );
keccak256_8x64_update( &ctx, input, 80 );
keccak256_8x64_close( &ctx, buffer );
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, buffer, 32 );
keccak256_8way_close( &ctx, state );
keccak256_8x64_init( &ctx );
keccak256_8x64_update( &ctx, buffer, 32 );
keccak256_8x64_close( &ctx, state );
}
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
@@ -71,13 +71,13 @@ void sha3d_hash_4way(void *state, const void *input)
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, input, 80 );
keccak256_4way_close( &ctx, buffer );
keccak256_4x64_init( &ctx );
keccak256_4x64_update( &ctx, input, 80 );
keccak256_4x64_close( &ctx, buffer );
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, buffer, 32 );
keccak256_4way_close( &ctx, state );
keccak256_4x64_init( &ctx );
keccak256_4x64_update( &ctx, buffer, 32 );
keccak256_4x64_close( &ctx, state );
}
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
View File
@@ -1,5 +1,4 @@
#include <string.h>
#include <immintrin.h>
#include "luffa-hash-2way.h"
#include <stdio.h>
View File
@@ -22,18 +22,18 @@
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#define cns(i) ( ( (__m128i*)CNS_INIT)[i] )
#define cns(i) ( ( (v128_t*)CNS_INIT)[i] )
#define ADD_CONSTANT( a, b, c0 ,c1 ) \
a = _mm_xor_si128( a, c0 ); \
b = _mm_xor_si128( b, c1 ); \
a = v128_xor( a, c0 ); \
b = v128_xor( b, c1 ); \
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#define MULT2( a0, a1 ) \
{ \
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
v128_t b = v128_xor( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
@@ -42,20 +42,35 @@
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, \
v128_t b = v128_xor( a0, \
_mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
#else
#elif defined(__ARM_NEON)
#pragma message "NEON for Luffa"
const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, \
v128_and( v128_32( vgetq_lane_u32( a1, 0 ) ), mask ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, \
_mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
v128_t b = v128_xor( a0, \
_mm_shuffle_epi32( v128_and( a1, MASK ), 0x10 ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif
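For readers comparing the three MULT2 variants, this is a scalar transliteration of the SSE2 path above, my own derivation and illustrative only: the 256-bit input is treated as eight 32-bit words x[0..7], with x[0..3] in register a0 (low lane first) and x[4..7] in a1. The word ordering follows this implementation's in-register layout, not the Luffa reference code's indexing.

#include <stdint.h>

// What the SSE2 MULT2 macro computes, lane by lane:
//   b      = a0 ^ { x4, x4, 0, x4 }            (shuffle of a1 & MASK)
//   new a0 = { x1^x4, x2, x3^x4, x4 }
//   new a1 = { x5,    x6, x7,    x0^x4 }
static void mult2_scalar( uint32_t x[8] )
{
    const uint32_t x0 = x[0], x4 = x[4];
    x[0] = x[1] ^ x4;
    x[1] = x[2];
    x[2] = x[3] ^ x4;
    x[3] = x4;
    x[4] = x[5];
    x[5] = x[6];
    x[6] = x[7];
    x[7] = x0 ^ x4;
}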
@@ -65,16 +80,16 @@
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m128i t = a0; \
v128_t t = a0; \
a0 = mm128_xoror( a3, a0, a1 ); \
a2 = _mm_xor_si128( a2, a3 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm128_xorand( a2, a3, t ); \
a2 = mm128_xorand( a1, a2, a0 ); \
a1 = _mm_or_si128( a1, a3 ); \
a3 = _mm_xor_si128( a3, a2 ); \
t = _mm_xor_si128( t, a1 ); \
a2 = _mm_and_si128( a2, a1 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = mm128_xnor( a1, a0 ); \
a0 = t; \
}
@@ -83,33 +98,33 @@
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m128i t = a0; \
a0 = _mm_or_si128( a0, a1 ); \
a2 = _mm_xor_si128( a2, a3 ); \
a1 = mm128_not( a1 ); \
a0 = _mm_xor_si128( a0, a3 ); \
a3 = _mm_and_si128( a3, t ); \
a1 = _mm_xor_si128( a1, a3 ); \
a3 = _mm_xor_si128( a3, a2 ); \
a2 = _mm_and_si128( a2, a0 ); \
a0 = mm128_not( a0 ); \
a2 = _mm_xor_si128( a2, a1 ); \
a1 = _mm_or_si128( a1, a3 ); \
t = _mm_xor_si128( t , a1 ); \
a3 = _mm_xor_si128( a3, a2 ); \
a2 = _mm_and_si128( a2, a1 ); \
a1 = _mm_xor_si128( a1, a0 ); \
v128_t t = a0; \
a0 = v128_or( a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = v128_not( a1 ); \
a0 = v128_xor( a0, a3 ); \
a3 = v128_and( a3, t ); \
a1 = v128_xor( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
a2 = v128_and( a2, a0 ); \
a0 = v128_not( a0 ); \
a2 = v128_xor( a2, a1 ); \
a1 = v128_or( a1, a3 ); \
t = v128_xor( t , a1 ); \
a3 = v128_xor( a3, a2 ); \
a2 = v128_and( a2, a1 ); \
a1 = v128_xor( a1, a0 ); \
a0 = t; \
}
#endif
#define MIXWORD( a, b ) \
b = _mm_xor_si128( a, b ); \
a = _mm_xor_si128( b, mm128_rol_32( a, 2 ) ); \
b = _mm_xor_si128( a, mm128_rol_32( b, 14 ) ); \
a = _mm_xor_si128( b, mm128_rol_32( a, 10 ) ); \
b = mm128_rol_32( b, 1 );
b = v128_xor( a, b ); \
a = v128_xor( b, v128_rol32( a, 2 ) ); \
b = v128_xor( a, v128_rol32( b, 14 ) ); \
a = v128_xor( b, v128_rol32( a, 10 ) ); \
b = v128_rol32( b, 1 );
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
SUBCRUMB( x0, x1, x2, x3 ); \
@@ -121,105 +136,47 @@
ADD_CONSTANT( x0, x4, c0, c1 );
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
t0 = _mm_shuffle_epi32( a1, 147 ); \
a1 = _mm_unpacklo_epi32( t0, a0 ); \
t0 = _mm_unpackhi_epi32( t0, a0 ); \
t1 = _mm_shuffle_epi32( t0, 78 ); \
a0 = _mm_shuffle_epi32( a1, 78 ); \
t0 = v128_shufll32( a1 ); \
a1 = v128_unpacklo32( t0, a0 ); \
t0 = v128_unpackhi32( t0, a0 ); \
t1 = v128_swap64( t0 ); \
a0 = v128_swap64( a1 ); \
SUBCRUMB( t1, t0, a0, a1 ); \
t0 = _mm_unpacklo_epi32( t0, t1 ); \
a1 = _mm_unpacklo_epi32( a1, a0 ); \
a0 = _mm_unpackhi_epi64( a1, t0 ); \
a1 = _mm_unpacklo_epi64( a1, t0 ); \
a1 = _mm_shuffle_epi32( a1, 57 ); \
t0 = v128_unpacklo32( t0, t1 ); \
a1 = v128_unpacklo32( a1, a0 ); \
a0 = v128_unpackhi64( a1, t0 ); \
a1 = v128_unpacklo64( a1, t0 ); \
a1 = v128_shuflr32( a1 ); \
MIXWORD( a0, a1 ); \
ADD_CONSTANT( a0, a1, c0, c1 );
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm_load_si128(&r1);\
q2 = _mm_load_si128(&p1);\
r2 = _mm_shuffle_epi32(r2,216);\
p2 = _mm_shuffle_epi32(p2,216);\
r1 = _mm_unpacklo_epi32(r1,r0);\
p1 = _mm_unpacklo_epi32(p1,p0);\
s2 = _mm_unpackhi_epi32(s2,r0);\
q2 = _mm_unpackhi_epi32(q2,p0);\
s0 = _mm_load_si128(&r2);\
q0 = _mm_load_si128(&p2);\
r2 = _mm_unpacklo_epi64(r2,r1);\
p2 = _mm_unpacklo_epi64(p2,p1);\
s1 = _mm_load_si128(&s0);\
q1 = _mm_load_si128(&q0);\
s0 = _mm_unpackhi_epi64(s0,r1);\
q0 = _mm_unpackhi_epi64(q0,p1);\
r2 = _mm_shuffle_epi32(r2,225);\
p2 = _mm_shuffle_epi32(p2,225);\
r0 = _mm_load_si128(&s1);\
p0 = _mm_load_si128(&q1);\
s0 = _mm_shuffle_epi32(s0,225);\
q0 = _mm_shuffle_epi32(q0,225);\
s1 = _mm_unpacklo_epi64(s1,s2);\
q1 = _mm_unpacklo_epi64(q1,q2);\
r0 = _mm_unpackhi_epi64(r0,s2);\
p0 = _mm_unpackhi_epi64(p0,q2);\
s2 = _mm_load_si128(&r0);\
q2 = _mm_load_si128(&p0);\
s3 = _mm_load_si128(&r2);\
q3 = _mm_load_si128(&p2);\
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm_load_si128(&r0);\
q0 = _mm_load_si128(&p0);\
s1 = _mm_load_si128(&r2);\
q1 = _mm_load_si128(&p2);\
r0 = _mm_unpackhi_epi32(r0,r1);\
p0 = _mm_unpackhi_epi32(p0,p1);\
r2 = _mm_unpackhi_epi32(r2,r3);\
p2 = _mm_unpackhi_epi32(p2,p3);\
s0 = _mm_unpacklo_epi32(s0,r1);\
q0 = _mm_unpacklo_epi32(q0,p1);\
s1 = _mm_unpacklo_epi32(s1,r3);\
q1 = _mm_unpacklo_epi32(q1,p3);\
r1 = _mm_load_si128(&r0);\
p1 = _mm_load_si128(&p0);\
r0 = _mm_unpackhi_epi64(r0,r2);\
p0 = _mm_unpackhi_epi64(p0,p2);\
s0 = _mm_unpackhi_epi64(s0,s1);\
q0 = _mm_unpackhi_epi64(q0,q1);\
r1 = _mm_unpacklo_epi64(r1,r2);\
p1 = _mm_unpacklo_epi64(p1,p2);\
s2 = _mm_load_si128(&r0);\
q2 = _mm_load_si128(&p0);\
s1 = _mm_load_si128(&r1);\
q1 = _mm_load_si128(&p1);\
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm_unpackhi_epi32( r3, r2 ); \
q1 = _mm_unpackhi_epi32( p3, p2 ); \
s3 = _mm_unpacklo_epi32( r3, r2 ); \
q3 = _mm_unpacklo_epi32( p3, p2 ); \
r3 = _mm_unpackhi_epi32( r1, r0 ); \
r1 = _mm_unpacklo_epi32( r1, r0 ); \
p3 = _mm_unpackhi_epi32( p1, p0 ); \
p1 = _mm_unpacklo_epi32( p1, p0 ); \
s0 = _mm_unpackhi_epi64( s1, r3 ); \
q0 = _mm_unpackhi_epi64( q1 ,p3 ); \
s1 = _mm_unpacklo_epi64( s1, r3 ); \
q1 = _mm_unpacklo_epi64( q1, p3 ); \
s2 = _mm_unpackhi_epi64( s3, r1 ); \
q2 = _mm_unpackhi_epi64( q3, p1 ); \
s3 = _mm_unpacklo_epi64( s3, r1 ); \
q3 = _mm_unpacklo_epi64( q3, p1 );
s1 = v128_unpackhi32( r3, r2 ); \
q1 = v128_unpackhi32( p3, p2 ); \
s3 = v128_unpacklo32( r3, r2 ); \
q3 = v128_unpacklo32( p3, p2 ); \
r3 = v128_unpackhi32( r1, r0 ); \
r1 = v128_unpacklo32( r1, r0 ); \
p3 = v128_unpackhi32( p1, p0 ); \
p1 = v128_unpacklo32( p1, p0 ); \
s0 = v128_unpackhi64( s1, r3 ); \
q0 = v128_unpackhi64( q1 ,p3 ); \
s1 = v128_unpacklo64( s1, r3 ); \
q1 = v128_unpacklo64( q1, p3 ); \
s2 = v128_unpackhi64( s3, r1 ); \
q2 = v128_unpackhi64( q3, p1 ); \
s3 = v128_unpacklo64( s3, r1 ); \
q3 = v128_unpacklo64( q3, p1 );
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 );
static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 );
static void finalization512( hashState_luffa *state, uint32 *b );
static void finalization512( hashState_luffa *state, uint32_t *b );
/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(16))) = {
static const uint32_t IV[40] __attribute((aligned(16))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
@@ -233,7 +190,7 @@ static const uint32 IV[40] __attribute((aligned(16))) = {
};
/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
static const uint32_t CNS_INIT[128] __attribute((aligned(16))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
@@ -269,29 +226,29 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
};
__m128i CNS128[32];
v128_t CNS128[32];
#if !defined(__SSE4_1__)
__m128i MASK;
v128_t MASK;
#endif
HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
int init_luffa(hashState_luffa *state, int hashbitlen)
{
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
MASK = v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] );
for ( i=0; i<10; i++ )
state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] );
state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
memset(state->buffer, 0, sizeof state->buffer );
return SUCCESS;
return 0;
}
HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
int update_luffa( hashState_luffa *state, const void *data,
size_t len )
{
int i;
@@ -301,8 +258,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
mm128_bswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, v128_bswap32( casti_v128( data, 1 ) ),
v128_bswap32( casti_v128( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -311,37 +268,37 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
if ( state->rembytes )
{
// remaining data bytes
casti_m128i( state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) );
casti_v128( state->buffer, 0 ) = v128_bswap32( cast_v128( data ) );
// padding of partial block
casti_m128i( state->buffer, 1 ) = _mm_set_epi32( 0, 0, 0, 0x80000000 );
casti_v128( state->buffer, 1 ) = v128_set32( 0, 0, 0, 0x80000000 );
}
return SUCCESS;
return 0;
}
HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval)
int final_luffa(hashState_luffa *state, void *hashval)
{
// transform pad block
if ( state->rembytes )
{
// not empty, data is in buffer
rnd512( state, casti_m128i( state->buffer, 1 ),
casti_m128i( state->buffer, 0 ) );
rnd512( state, casti_v128( state->buffer, 1 ),
casti_v128( state->buffer, 0 ) );
}
else
{
// empty pad block, constant data
rnd512( state, _mm_setzero_si128(), _mm_set_epi32( 0, 0, 0, 0x80000000 ) );
rnd512( state, v128_zero, v128_set32( 0, 0, 0, 0x80000000 ) );
}
finalization512(state, (uint32*) hashval);
finalization512(state, (uint32_t*) hashval);
if ( state->hashbitlen > 512 )
finalization512( state, (uint32*)( hashval+128 ) );
return SUCCESS;
finalization512( state, (uint32_t*)( hashval+128 ) );
return 0;
}
HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
const BitSequence* data, size_t inlen )
int update_and_final_luffa( hashState_luffa *state, void* output,
const void* data, size_t inlen )
{
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
int i;
@@ -351,43 +308,43 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
mm128_bswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, v128_bswap32( casti_v128( data, 1 ) ),
v128_bswap32( casti_v128( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512( state, mm128_mov64_128( 0x80000000 ),
mm128_bswap_32( cast_m128i( data ) ) );
rnd512( state, v128_mov64( 0x80000000 ),
v128_bswap32( cast_v128( data ) ) );
else
// empty pad block
rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) );
rnd512( state, v128_zero, v128_64( 0x80000000 ) );
finalization512( state, (uint32*) output );
finalization512( state, (uint32_t*) output );
if ( state->hashbitlen > 512 )
finalization512( state, (uint32*)( output+128 ) );
finalization512( state, (uint32_t*)( output+128 ) );
return SUCCESS;
return 0;
}
int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
const BitSequence* data, size_t inlen )
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen )
{
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
MASK= v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] );
for ( i=0; i<10; i++ )
state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] );
state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
memset(state->buffer, 0, sizeof state->buffer );
// update
@@ -398,8 +355,8 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
mm128_bswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, v128_bswap32( casti_v128( data, 1 ) ),
v128_bswap32( casti_v128( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -408,17 +365,17 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512( state, mm128_mov64_128( 0x80000000 ),
mm128_bswap_32( cast_m128i( data ) ) );
rnd512( state, v128_mov64( 0x80000000 ),
v128_bswap32( cast_v128( data ) ) );
else
// empty pad block
rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) );
rnd512( state, v128_zero, v128_mov64( 0x80000000 ) );
finalization512( state, (uint32*) output );
finalization512( state, (uint32_t*) output );
if ( state->hashbitlen > 512 )
finalization512( state, (uint32*)( output+128 ) );
finalization512( state, (uint32_t*)( output+128 ) );
return SUCCESS;
return 0;
}
@@ -426,97 +383,97 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
/* Round function */
/* state: hash context */
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 )
{
__m128i t0, t1;
__m128i *chainv = state->chainv;
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
v128_t t0, t1;
v128_t *chainv = state->chainv;
v128_t x0, x1, x2, x3, x4, x5, x6, x7;
t0 = mm128_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm128_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm128_xor3( t0, chainv[6], chainv[8] );
t1 = mm128_xor3( t1, chainv[7], chainv[9] );
t0 = v128_xor3( chainv[0], chainv[2], chainv[4] );
t1 = v128_xor3( chainv[1], chainv[3], chainv[5] );
t0 = v128_xor3( t0, chainv[6], chainv[8] );
t1 = v128_xor3( t1, chainv[7], chainv[9] );
MULT2( t0, t1 );
msg0 = _mm_shuffle_epi32( msg0, 27 );
msg1 = _mm_shuffle_epi32( msg1, 27 );
msg0 = v128_rev32( msg0 );
msg1 = v128_rev32( msg1 );
chainv[0] = _mm_xor_si128( chainv[0], t0 );
chainv[1] = _mm_xor_si128( chainv[1], t1 );
chainv[2] = _mm_xor_si128( chainv[2], t0 );
chainv[3] = _mm_xor_si128( chainv[3], t1 );
chainv[4] = _mm_xor_si128( chainv[4], t0 );
chainv[5] = _mm_xor_si128( chainv[5], t1 );
chainv[6] = _mm_xor_si128( chainv[6], t0 );
chainv[7] = _mm_xor_si128( chainv[7], t1 );
chainv[8] = _mm_xor_si128( chainv[8], t0 );
chainv[9] = _mm_xor_si128( chainv[9], t1 );
chainv[0] = v128_xor( chainv[0], t0 );
chainv[1] = v128_xor( chainv[1], t1 );
chainv[2] = v128_xor( chainv[2], t0 );
chainv[3] = v128_xor( chainv[3], t1 );
chainv[4] = v128_xor( chainv[4], t0 );
chainv[5] = v128_xor( chainv[5], t1 );
chainv[6] = v128_xor( chainv[6], t0 );
chainv[7] = v128_xor( chainv[7], t1 );
chainv[8] = v128_xor( chainv[8], t0 );
chainv[9] = v128_xor( chainv[9], t1 );
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1]);
chainv[0] = _mm_xor_si128( chainv[0], chainv[2] );
chainv[1] = _mm_xor_si128( chainv[1], chainv[3] );
chainv[0] = v128_xor( chainv[0], chainv[2] );
chainv[1] = v128_xor( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3]);
chainv[2] = _mm_xor_si128(chainv[2], chainv[4]);
chainv[3] = _mm_xor_si128(chainv[3], chainv[5]);
chainv[2] = v128_xor(chainv[2], chainv[4]);
chainv[3] = v128_xor(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm_xor_si128(chainv[4], chainv[6]);
chainv[5] = _mm_xor_si128(chainv[5], chainv[7]);
chainv[4] = v128_xor(chainv[4], chainv[6]);
chainv[5] = v128_xor(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm_xor_si128(chainv[6], chainv[8]);
chainv[7] = _mm_xor_si128(chainv[7], chainv[9]);
chainv[6] = v128_xor(chainv[6], chainv[8]);
chainv[7] = v128_xor(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9]);
t0 = chainv[8] = _mm_xor_si128( chainv[8], t0 );
t1 = chainv[9] = _mm_xor_si128( chainv[9], t1 );
t0 = chainv[8] = v128_xor( chainv[8], t0 );
t1 = chainv[9] = v128_xor( chainv[9], t1 );
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm_xor_si128( chainv[8], chainv[6] );
chainv[9] = _mm_xor_si128( chainv[9], chainv[7] );
chainv[8] = v128_xor( chainv[8], chainv[6] );
chainv[9] = v128_xor( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm_xor_si128( chainv[6], chainv[4] );
chainv[7] = _mm_xor_si128( chainv[7], chainv[5] );
chainv[6] = v128_xor( chainv[6], chainv[4] );
chainv[7] = v128_xor( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm_xor_si128( chainv[4], chainv[2] );
chainv[5] = _mm_xor_si128( chainv[5], chainv[3] );
chainv[4] = v128_xor( chainv[4], chainv[2] );
chainv[5] = v128_xor( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm_xor_si128( chainv[2], chainv[0] );
chainv[3] = _mm_xor_si128( chainv[3], chainv[1] );
chainv[2] = v128_xor( chainv[2], chainv[0] );
chainv[3] = v128_xor( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t0 ), msg0 );
chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t1 ), msg1 );
chainv[0] = v128_xor( v128_xor( chainv[0], t0 ), msg0 );
chainv[1] = v128_xor( v128_xor( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1);
chainv[2] = _mm_xor_si128( chainv[2], msg0 );
chainv[3] = _mm_xor_si128( chainv[3], msg1 );
chainv[2] = v128_xor( chainv[2], msg0 );
chainv[3] = v128_xor( chainv[3], msg1 );
MULT2( msg0, msg1);
chainv[4] = _mm_xor_si128( chainv[4], msg0 );
chainv[5] = _mm_xor_si128( chainv[5], msg1 );
chainv[4] = v128_xor( chainv[4], msg0 );
chainv[5] = v128_xor( chainv[5], msg1 );
MULT2( msg0, msg1);
chainv[6] = _mm_xor_si128( chainv[6], msg0 );
chainv[7] = _mm_xor_si128( chainv[7], msg1 );
chainv[6] = v128_xor( chainv[6], msg0 );
chainv[7] = v128_xor( chainv[7], msg1 );
MULT2( msg0, msg1);
chainv[8] = _mm_xor_si128( chainv[8], msg0 );
chainv[9] = _mm_xor_si128( chainv[9], msg1 );
chainv[8] = v128_xor( chainv[8], msg0 );
chainv[9] = v128_xor( chainv[9], msg1 );
MULT2( msg0, msg1);
chainv[3] = mm128_rol_32( chainv[3], 1 );
chainv[5] = mm128_rol_32( chainv[5], 2 );
chainv[7] = mm128_rol_32( chainv[7], 3 );
chainv[9] = mm128_rol_32( chainv[9], 4 );
chainv[3] = v128_rol32( chainv[3], 1 );
chainv[5] = v128_rol32( chainv[5], 2 );
chainv[7] = v128_rol32( chainv[7], 3 );
chainv[9] = v128_rol32( chainv[9], 4 );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
@@ -549,57 +506,57 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
/* state: hash context */
/* b[8]: hash values */
static void finalization512( hashState_luffa *state, uint32 *b )
static void finalization512( hashState_luffa *state, uint32_t *b )
{
uint32 hash[8] __attribute((aligned(64)));
__m128i* chainv = state->chainv;
__m128i t[2];
const __m128i zero = _mm_setzero_si128();
uint32_t hash[8] __attribute((aligned(64)));
v128_t* chainv = state->chainv;
v128_t t[2];
const v128_t zero = v128_zero;
/*---- blank round with m=0 ----*/
rnd512( state, zero, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm_xor_si128(t[0], chainv[2]);
t[1] = _mm_xor_si128(t[1], chainv[3]);
t[0] = _mm_xor_si128(t[0], chainv[4]);
t[1] = _mm_xor_si128(t[1], chainv[5]);
t[0] = _mm_xor_si128(t[0], chainv[6]);
t[1] = _mm_xor_si128(t[1], chainv[7]);
t[0] = _mm_xor_si128(t[0], chainv[8]);
t[1] = _mm_xor_si128(t[1], chainv[9]);
t[0] = v128_xor(t[0], chainv[2]);
t[1] = v128_xor(t[1], chainv[3]);
t[0] = v128_xor(t[0], chainv[4]);
t[1] = v128_xor(t[1], chainv[5]);
t[0] = v128_xor(t[0], chainv[6]);
t[1] = v128_xor(t[1], chainv[7]);
t[0] = v128_xor(t[0], chainv[8]);
t[1] = v128_xor(t[1], chainv[9]);
t[0] = _mm_shuffle_epi32(t[0], 27);
t[1] = _mm_shuffle_epi32(t[1], 27);
t[0] = v128_rev32( t[0] );
t[1] = v128_rev32( t[1] );
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
v128_store((v128_t*)&hash[0], t[0]);
v128_store((v128_t*)&hash[4], t[1]);
casti_m128i( b, 0 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
casti_v128( b, 0 ) = v128_bswap32( casti_v128( hash, 0 ) );
casti_v128( b, 1 ) = v128_bswap32( casti_v128( hash, 1 ) );
rnd512( state, zero, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm_xor_si128(t[0], chainv[2]);
t[1] = _mm_xor_si128(t[1], chainv[3]);
t[0] = _mm_xor_si128(t[0], chainv[4]);
t[1] = _mm_xor_si128(t[1], chainv[5]);
t[0] = _mm_xor_si128(t[0], chainv[6]);
t[1] = _mm_xor_si128(t[1], chainv[7]);
t[0] = _mm_xor_si128(t[0], chainv[8]);
t[1] = _mm_xor_si128(t[1], chainv[9]);
t[0] = v128_xor(t[0], chainv[2]);
t[1] = v128_xor(t[1], chainv[3]);
t[0] = v128_xor(t[0], chainv[4]);
t[1] = v128_xor(t[1], chainv[5]);
t[0] = v128_xor(t[0], chainv[6]);
t[1] = v128_xor(t[1], chainv[7]);
t[0] = v128_xor(t[0], chainv[8]);
t[1] = v128_xor(t[1], chainv[9]);
t[0] = _mm_shuffle_epi32(t[0], 27);
t[1] = _mm_shuffle_epi32(t[1], 27);
t[0] = v128_rev32( t[0] );
t[1] = v128_rev32( t[1] );
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_v128( hash, 0 ) = t[0];
casti_v128( hash, 1 ) = t[1];
casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
casti_v128( b, 2 ) = v128_bswap32( casti_v128( hash, 0 ) );
casti_v128( b, 3 ) = v128_bswap32( casti_v128( hash, 1 ) );
}
/***************************************************/
View File
@@ -21,8 +21,8 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <emmintrin.h>
#include "compat/sha3-defs.h"
//#include <emmintrin.h>
//#include "compat/sha3-defs.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256
@@ -49,23 +49,23 @@
/*********************************/
typedef struct {
uint32 buffer[8] __attribute((aligned(32)));
__m128i chainv[10] __attribute((aligned(32))); /* Chaining values */
uint32_t buffer[8] __attribute((aligned(32)));
v128_t chainv[10] __attribute((aligned(32))); /* Chaining values */
int hashbitlen;
int rembytes;
} hashState_luffa;
HashReturn init_luffa( hashState_luffa *state, int hashbitlen );
int init_luffa( hashState_luffa *state, int hashbitlen );
// len is in bytes
HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
int update_luffa( hashState_luffa *state, const void *data,
size_t len );
HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval );
int final_luffa( hashState_luffa *state, void *hashval );
HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
const BitSequence* data, size_t inlen );
int update_and_final_luffa( hashState_luffa *state, void* output,
const void* data, size_t inlen );
int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
const BitSequence* data, size_t inlen );
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___
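A minimal sketch of the reworked Luffa calls as declared above (the 80-byte input and 512-bit output are illustrative; alignment expectations follow the comments in the .c file):

// Sketch only, not from the commit.
#include "luffa_for_sse2.h"

// One-shot: init, absorb and finalize in a single call. 'header80' is
// assumed 16-byte aligned; hash64 receives the 512-bit digest.
void example_luffa512( hashState_luffa *ctx, void *hash64, const void *header80 )
{
    luffa_full( ctx, hash64, 512, header80, 80 );
}

// Incremental form using the separate init and update+final entry points.
void example_luffa512_incr( hashState_luffa *ctx, void *hash64,
                            const void *header80 )
{
    init_luffa( ctx, 512 );
    update_and_final_luffa( ctx, hash64, header80, 80 );
}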
View File
@@ -1,5 +1,5 @@
#include "lyra2-gate.h"
#include <mm_malloc.h>
// huge pages
//
View File
@@ -63,7 +63,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
lyra2h_4way_midstate( vdata );
do {
View File
@@ -353,9 +353,6 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
return 0;
}
#endif
/*
#elif defined (LYRA2REV2_4WAY)
typedef struct {
@@ -452,7 +449,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
@@ -480,4 +477,4 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
}
#endif
*/
View File
@@ -371,7 +371,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
blake256_4way_init( &l2v3_4way_ctx.blake );
View File
@@ -75,11 +75,11 @@ int scanhash_lyra2rev3( struct work *work,
((uint32_t*)ptarget)[7] = 0x0000ff;
// need big endian data
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 0 ) = v128_bswap32( casti_v128( pdata, 0 ) );
casti_v128( endiandata, 1 ) = v128_bswap32( casti_v128( pdata, 1 ) );
casti_v128( endiandata, 2 ) = v128_bswap32( casti_v128( pdata, 2 ) );
casti_v128( endiandata, 3 ) = v128_bswap32( casti_v128( pdata, 3 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
l2v3_blake256_midstate( endiandata );
do
{
View File
@@ -312,7 +312,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
lyra2z_4way_midstate( vdata );
View File
@@ -53,7 +53,6 @@ int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;
View File
@@ -2,6 +2,7 @@
#include "algo-gate-api.h"
#include "lyra2.h"
#include "simd-utils.h"
#include <mm_malloc.h>
static __thread uint64_t* lyra2z330_wholeMatrix;
@@ -29,11 +30,11 @@ int scanhash_lyra2z330( struct work *work, uint32_t max_nonce,
if (opt_benchmark)
ptarget[7] = 0x0000ff;
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( edata, 0 ) = v128_bswap32( casti_v128( pdata, 0 ) );
casti_v128( edata, 1 ) = v128_bswap32( casti_v128( pdata, 1 ) );
casti_v128( edata, 2 ) = v128_bswap32( casti_v128( pdata, 2 ) );
casti_v128( edata, 3 ) = v128_bswap32( casti_v128( pdata, 3 ) );
casti_v128( edata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
do
{
View File
@@ -23,9 +23,9 @@
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <immintrin.h>
#include "sponge.h"
#include "lyra2.h"
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
View File
@@ -22,7 +22,7 @@
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <immintrin.h>
#include "simd-utils.h"
#include "sponge.h"
#include "lyra2.h"
View File
@@ -195,7 +195,7 @@ static const uint64_t blake2b_IV[8] =
#endif // AVX2 else SSE2
/*
// Scalar, not used.
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
@@ -223,7 +223,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
View File
@@ -42,7 +42,7 @@ do { \
//
// Panama-256 4 way SSE2
#define LVAR17_4W(b) __m128i \
#define LVAR17_4W(b) v128_t \
b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \
b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \
b ## 12, b ## 13, b ## 14, b ## 15, b ## 16;
@@ -53,9 +53,9 @@ do { \
#define BUPDATE1_4W( n0, n2 ) \
do { \
sc->buffer[ptr24][n0] = _mm_xor_si128( sc->buffer[ptr24][n0], \
sc->buffer[ptr24][n0] = v128_xor( sc->buffer[ptr24][n0], \
sc->buffer[ptr31][n2] ); \
sc->buffer[ptr31][n2] = _mm_xor_si128( sc->buffer[ptr31][n2], INW1(n2) ); \
sc->buffer[ptr31][n2] = v128_xor( sc->buffer[ptr31][n2], INW1(n2) ); \
} while (0)
#define BUPDATE_4W \
@@ -71,50 +71,50 @@ do { \
} while (0)
#define GAMMA_4W(n0, n1, n2, n4) \
(g ## n0 = _mm_xor_si128( a ## n0, \
_mm_or_si128( a ## n1, mm128_not( a ## n2 ) ) ) )
(g ## n0 = v128_xor( a ## n0, \
v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
#define PI_ALL_4W do { \
a0 = g0; \
a1 = mm128_rol_32( g7, 1 ); \
a2 = mm128_rol_32( g14, 3 ); \
a3 = mm128_rol_32( g4, 6 ); \
a4 = mm128_rol_32( g11, 10 ); \
a5 = mm128_rol_32( g1, 15 ); \
a6 = mm128_rol_32( g8, 21 ); \
a7 = mm128_rol_32( g15, 28 ); \
a8 = mm128_rol_32( g5, 4 ); \
a9 = mm128_rol_32( g12, 13 ); \
a10 = mm128_rol_32( g2, 23 ); \
a11 = mm128_rol_32( g9, 2 ); \
a12 = mm128_rol_32( g16, 14 ); \
a13 = mm128_rol_32( g6, 27 ); \
a14 = mm128_rol_32( g13, 9 ); \
a15 = mm128_rol_32( g3, 24 ); \
a16 = mm128_rol_32( g10, 8 ); \
a1 = v128_rol32( g7, 1 ); \
a2 = v128_rol32( g14, 3 ); \
a3 = v128_rol32( g4, 6 ); \
a4 = v128_rol32( g11, 10 ); \
a5 = v128_rol32( g1, 15 ); \
a6 = v128_rol32( g8, 21 ); \
a7 = v128_rol32( g15, 28 ); \
a8 = v128_rol32( g5, 4 ); \
a9 = v128_rol32( g12, 13 ); \
a10 = v128_rol32( g2, 23 ); \
a11 = v128_rol32( g9, 2 ); \
a12 = v128_rol32( g16, 14 ); \
a13 = v128_rol32( g6, 27 ); \
a14 = v128_rol32( g13, 9 ); \
a15 = v128_rol32( g3, 24 ); \
a16 = v128_rol32( g10, 8 ); \
} while (0)
#define THETA_4W(n0, n1, n2, n4) \
( g ## n0 = _mm_xor_si128( a ## n0, _mm_xor_si128( a ## n1, a ## n4 ) ) )
( g ## n0 = v128_xor( a ## n0, v128_xor( a ## n1, a ## n4 ) ) )
#define SIGMA_ALL_4W do { \
a0 = _mm_xor_si128( g0, v128_32( 1 ) ); \
a1 = _mm_xor_si128( g1, INW2( 0 ) ); \
a2 = _mm_xor_si128( g2, INW2( 1 ) ); \
a3 = _mm_xor_si128( g3, INW2( 2 ) ); \
a4 = _mm_xor_si128( g4, INW2( 3 ) ); \
a5 = _mm_xor_si128( g5, INW2( 4 ) ); \
a6 = _mm_xor_si128( g6, INW2( 5 ) ); \
a7 = _mm_xor_si128( g7, INW2( 6 ) ); \
a8 = _mm_xor_si128( g8, INW2( 7 ) ); \
a9 = _mm_xor_si128( g9, sc->buffer[ ptr16 ][0] ); \
a10 = _mm_xor_si128( g10, sc->buffer[ ptr16 ][1] ); \
a11 = _mm_xor_si128( g11, sc->buffer[ ptr16 ][2] ); \
a12 = _mm_xor_si128( g12, sc->buffer[ ptr16 ][3] ); \
a13 = _mm_xor_si128( g13, sc->buffer[ ptr16 ][4] ); \
a14 = _mm_xor_si128( g14, sc->buffer[ ptr16 ][5] ); \
a15 = _mm_xor_si128( g15, sc->buffer[ ptr16 ][6] ); \
a16 = _mm_xor_si128( g16, sc->buffer[ ptr16 ][7] ); \
a0 = v128_xor( g0, v128_32( 1 ) ); \
a1 = v128_xor( g1, INW2( 0 ) ); \
a2 = v128_xor( g2, INW2( 1 ) ); \
a3 = v128_xor( g3, INW2( 2 ) ); \
a4 = v128_xor( g4, INW2( 3 ) ); \
a5 = v128_xor( g5, INW2( 4 ) ); \
a6 = v128_xor( g6, INW2( 5 ) ); \
a7 = v128_xor( g7, INW2( 6 ) ); \
a8 = v128_xor( g8, INW2( 7 ) ); \
a9 = v128_xor( g9, sc->buffer[ ptr16 ][0] ); \
a10 = v128_xor( g10, sc->buffer[ ptr16 ][1] ); \
a11 = v128_xor( g11, sc->buffer[ ptr16 ][2] ); \
a12 = v128_xor( g12, sc->buffer[ ptr16 ][3] ); \
a13 = v128_xor( g13, sc->buffer[ ptr16 ][4] ); \
a14 = v128_xor( g14, sc->buffer[ ptr16 ][5] ); \
a15 = v128_xor( g15, sc->buffer[ ptr16 ][6] ); \
a16 = v128_xor( g16, sc->buffer[ ptr16 ][7] ); \
} while (0)
#define PANAMA_STEP_4W do { \
@@ -138,7 +138,7 @@ panama_4way_push( panama_4way_context *sc, const unsigned char *pbuf,
LVARS_4W
unsigned ptr0;
#define INW1(i) casti_m128i( pbuf, i )
#define INW1(i) casti_v128( pbuf, i )
#define INW2(i) INW1(i)
M17( RSTATE );
@@ -167,7 +167,7 @@ panama_4way_pull( panama_4way_context *sc, unsigned num )
#define INW1(i) INW_H1(INC ## i)
#define INW_H1(i) INW_H2(i)
#define INW_H2(i) a ## i
#define INW2(i) casti_m128i( sc->buffer[ptr4], i )
#define INW2(i) casti_v128( sc->buffer[ptr4], i )
M17( RSTATE );
ptr0 = sc->buffer_ptr;
@@ -254,7 +254,7 @@ panama_4way_update( void *cc, const void *data, size_t len )
rlen = len & 31;
if ( rlen > 0 )
memcpy_128( (__m128i*)sc->data, (__m128i*)data + len - rlen, rlen );
v128_memcpy( (v128_t*)sc->data, (v128_t*)data + len - rlen, rlen );
sc->data_ptr = rlen;
}
@@ -268,13 +268,13 @@ panama_4way_close( void *cc, void *dst )
sc = cc;
current = sc->data_ptr;
*(__m128i*)( sc->data + current ) = v128_32( 1 );
*(v128_t*)( sc->data + current ) = v128_32( 1 );
current++;
memset_zero_128( (__m128i*)sc->data + current, 32 - current );
v128_memset_zero( (v128_t*)sc->data + current, 32 - current );
panama_4way_push( sc, sc->data, 1 );
panama_4way_pull( sc, 32 );
for ( i = 0; i < 8; i ++ )
casti_m128i( dst, i ) = sc->state[i + 9];
casti_v128( dst, i ) = sc->state[i + 9];
}
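GAMMA_4W, PI_ALL_4W, THETA_4W and SIGMA_ALL_4W above are the Panama state update applied to four interleaved 32-bit lanes at once. As a reminder of the scalar layers they vectorise, here is a sketch of gamma and theta over the 17-word state (indices mod 17, matching the n1/n2 and n1/n4 offsets in the macros); pi (rotate and permute) and sigma (buffer injection) sit between them in the real step and are omitted.

#include <stdint.h>

// Scalar sketch of Panama's gamma and theta layers (not project code).
static inline void panama_gamma_sketch( uint32_t g[17], const uint32_t a[17] )
{
   for ( int i = 0; i < 17; i++ )
      g[i] = a[i] ^ ( a[ (i+1) % 17 ] | ~a[ (i+2) % 17 ] );
}

static inline void panama_theta_sketch( uint32_t g[17], const uint32_t a[17] )
{
   for ( int i = 0; i < 17; i++ )
      g[i] = a[i] ^ a[ (i+1) % 17 ] ^ a[ (i+4) % 17 ];
}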

View File

@@ -11,8 +11,8 @@
typedef struct {
unsigned char data[32<<2];
__m128i buffer[32][8];
__m128i state[17];
v128_t buffer[32][8];
v128_t state[17];
unsigned data_ptr;
unsigned buffer_ptr;
} panama_4way_context __attribute__ ((aligned (64)));

View File

@@ -56,21 +56,20 @@ void deep_hash(void *output, const void *input)
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
memcpy( &ctx.luffa, &deep_luffa_mid, sizeof deep_luffa_mid );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)input + midlen, tail );
update_and_final_luffa( &ctx.luffa, hash,
input + midlen, tail );
cubehashUpdateDigest( &ctx.cubehash, (byte*)hash,
(const byte*) hash,64);
cubehashUpdateDigest( &ctx.cubehash, hash,
hash,64);
#ifdef __AES__
update_final_echo ( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, 512);
update_final_echo ( &ctx.echo, hash,
hash, 512);
#else
sph_echo512 (&ctx.echo, (const void*) hash, 64);
sph_echo512_close(&ctx.echo, (void*) hash);
#endif
asm volatile ("emms");
memcpy(output, hash, 32);
}

View File

@@ -82,7 +82,6 @@ void qubit_hash(void *output, const void *input)
sph_echo512_close(&ctx.echo, (void*) hash);
#endif
asm volatile ("emms");
memcpy(output, hash, 32);
}

View File

@@ -8,6 +8,7 @@
#include <stdio.h>
#include "sph_ripemd.h"
#include "algo/sha/sha256-hash.h"
#include "algo/sha/sha512-hash.h"
void lbry_hash(void* output, const void* input)
{

View File

@@ -197,99 +197,99 @@ do{ \
do{ \
TYPE TA = ADD32( XA0, XA3 ); \
TYPE TB = ADD32( XB0, XB3 ); \
TYPE T = _mm_slli_epi32( TA, 7 ); \
TA = _mm_srli_epi32( TA, 25 ); \
TYPE T = v128_sl32( TA, 7 ); \
TA = v128_sr32( TA, 25 ); \
XA1 = XOR( XA1, T ); \
XA1 = XOR( XA1, TA ); \
T = _mm_slli_epi32( TB, 7 );\
TB = _mm_srli_epi32( TB, 25 ); \
T = v128_sl32( TB, 7 );\
TB = v128_sr32( TB, 25 ); \
XB1 = XOR( XB1, T ); \
XB1 = XOR( XB1, TB ); \
\
TA = ADD32( XA1, XA0 ); \
TB = ADD32( XB1, XB0 ); \
T = _mm_slli_epi32( TA, 9 ); \
TA = _mm_srli_epi32( TA, 23 ); \
T = v128_sl32( TA, 9 ); \
TA = v128_sr32( TA, 23 ); \
XA2 = XOR( XA2, T ); \
XA2 = XOR( XA2, TA ); \
T = _mm_slli_epi32( TB, 9 );\
TB = _mm_srli_epi32( TB, 23 );\
T = v128_sl32( TB, 9 );\
TB = v128_sr32( TB, 23 );\
XB2 = XOR( XB2, T ); \
XB2 = XOR( XB2, TB ); \
\
TA = ADD32( XA2, XA1 ); \
TB = ADD32( XB2, XB1 ); \
T = _mm_slli_epi32( TA, 13); \
TA = _mm_srli_epi32( TA, 19 ); \
T = v128_sl32( TA, 13); \
TA = v128_sr32( TA, 19 ); \
XA1 = ROL_1X32( XA1 ); \
XB1 = ROL_1X32( XB1 ); \
XA3 = XOR( XA3, T ); \
XA3 = XOR( XA3, TA ); \
T = _mm_slli_epi32( TB, 13); \
TB = _mm_srli_epi32( TB, 19 ); \
T = v128_sl32( TB, 13); \
TB = v128_sr32( TB, 19 ); \
XB3 = XOR( XB3, T ); \
XB3 = XOR( XB3, TB ); \
\
TA = ADD32( XA3, XA2 ); \
TB = ADD32( XB3, XB2 ); \
T = _mm_slli_epi32( TA, 18 ); \
TA = _mm_srli_epi32( TA, 14 ); \
T = v128_sl32( TA, 18 ); \
TA = v128_sr32( TA, 14 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XA0 = XOR( XA0, T ); \
XA0 = XOR( XA0, TA ); \
T = _mm_slli_epi32( TB, 18 ); \
TB = _mm_srli_epi32( TB, 14 ); \
T = v128_sl32( TB, 18 ); \
TB = v128_sr32( TB, 14 ); \
XB0 = XOR( XB0, T ); \
XB0 = XOR( XB0, TB ); \
\
TA = ADD32( XA0, XA1 ); \
TB = ADD32( XB0, XB1 ); \
T = _mm_slli_epi32( TA, 7 ); \
TA = _mm_srli_epi32( TA, 25 ); \
T = v128_sl32( TA, 7 ); \
TA = v128_sr32( TA, 25 ); \
XA3 = ROR_1X32( XA3 ); \
XA3 = XOR( XA3, T ); \
XA3 = XOR( XA3, TA ); \
T = _mm_slli_epi32( TB, 7 ); \
TB = _mm_srli_epi32( TB, 25 ); \
T = v128_sl32( TB, 7 ); \
TB = v128_sr32( TB, 25 ); \
XB3 = ROR_1X32( XB3 ); \
XB3 = XOR( XB3, T ); \
XB3 = XOR( XB3, TB ); \
\
TA = ADD32( XA3, XA0 ); \
TB = ADD32( XB3, XB0 ); \
T = _mm_slli_epi32( TA, 9 ); \
TA = _mm_srli_epi32( TA, 23 ); \
T = v128_sl32( TA, 9 ); \
TA = v128_sr32( TA, 23 ); \
XA2 = XOR( XA2, T ); \
XA2 = XOR( XA2, TA ); \
T = _mm_slli_epi32( TB, 9 ); \
TB = _mm_srli_epi32( TB, 23 ); \
T = v128_sl32( TB, 9 ); \
TB = v128_sr32( TB, 23 ); \
XB2 = XOR( XB2, T ); \
XB2 = XOR( XB2, TB ); \
\
TA = ADD32( XA2, XA3 ); \
TB = ADD32( XB2, XB3 ); \
T = _mm_slli_epi32( TA, 13 ); \
TA = _mm_srli_epi32( TA, 19 ); \
T = v128_sl32( TA, 13 ); \
TA = v128_sr32( TA, 19 ); \
XA3 = ROL_1X32( XA3 ); \
XB3 = ROL_1X32( XB3 ); \
XA1 = XOR( XA1, T ); \
XA1 = XOR( XA1, TA ); \
T = _mm_slli_epi32( TB, 13 ); \
TB = _mm_srli_epi32( TB, 19 ); \
T = v128_sl32( TB, 13 ); \
TB = v128_sr32( TB, 19 ); \
XB1 = XOR( XB1, T ); \
XB1 = XOR( XB1, TB ); \
\
TA = ADD32( XA1, XA2 ); \
TB = ADD32( XB1, XB2 ); \
T = _mm_slli_epi32( TA, 18 ); \
TA = _mm_srli_epi32( TA, 14 ); \
T = v128_sl32( TA, 18 ); \
TA = v128_sr32( TA, 14 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XA0 = XOR( XA0, T ); \
XA0 = XOR( XA0, TA ); \
T = _mm_slli_epi32( TB, 18 ); \
TB = _mm_srli_epi32( TB, 14 ); \
T = v128_sl32( TB, 18 ); \
TB = v128_sr32( TB, 14 ); \
XA1 = ROR_1X32( XA1 ); \
XB0 = XOR( XB0, T ); \
XB0 = XOR( XB0, TB ); \
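Each shift/XOR group above is one Salsa20 quarter-round operation with the 32-bit rotate open-coded: because (t << r) and (t >> (32-r)) have no bits in common, XORing both into the target is the same as XORing in rotl32(t, r), and it avoids needing a native vector rotate. A scalar sketch of the full quarter-round for comparison (illustrative only, not code from this tree):

#include <stdint.h>

// Salsa20 quarter-round; the rotates are spelled the same way as the
// vector macros above, as a pair of shifts feeding XORs.
static inline void salsa_quarter_round_sketch( uint32_t *a, uint32_t *b,
                                               uint32_t *c, uint32_t *d )
{
   uint32_t t;
   t = *a + *d;   *b ^= ( t << 7 )  | ( t >> 25 );
   t = *b + *a;   *c ^= ( t << 9 )  | ( t >> 23 );
   t = *c + *b;   *d ^= ( t << 13 ) | ( t >> 19 );
   t = *d + *c;   *a ^= ( t << 18 ) | ( t >> 14 );
}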
@@ -423,88 +423,88 @@ do{ \
TYPE TA = ADD32( XA0, XA3 ); \
TYPE TB = ADD32( XB0, XB3 ); \
TYPE TC = ADD32( XC0, XC3 ); \
TYPE T = _mm_slli_epi32( TA, 7 ); \
TA = _mm_srli_epi32( TA, 25 ); \
TYPE T = v128_sl32( TA, 7 ); \
TA = v128_sr32( TA, 25 ); \
XA1 = XOR( XA1, T ); \
XA1 = XOR( XA1, TA ); \
T = _mm_slli_epi32( TB, 7 );\
TB = _mm_srli_epi32( TB, 25 ); \
T = v128_sl32( TB, 7 );\
TB = v128_sr32( TB, 25 ); \
XB1 = XOR( XB1, T ); \
XB1 = XOR( XB1, TB ); \
T = _mm_slli_epi32( TC, 7 );\
TC = _mm_srli_epi32( TC, 25 );\
T = v128_sl32( TC, 7 );\
TC = v128_sr32( TC, 25 );\
XC1 = XOR( XC1, T ); \
XC1 = XOR( XC1, TC ); \
\
TA = ADD32( XA1, XA0 ); \
TB = ADD32( XB1, XB0 ); \
TC = ADD32( XC1, XC0 ); \
T = _mm_slli_epi32( TA, 9 ); \
TA = _mm_srli_epi32( TA, 23 ); \
T = v128_sl32( TA, 9 ); \
TA = v128_sr32( TA, 23 ); \
XA2 = XOR( XA2, T ); \
XA2 = XOR( XA2, TA ); \
T = _mm_slli_epi32( TB, 9 );\
TB = _mm_srli_epi32( TB, 23 );\
T = v128_sl32( TB, 9 );\
TB = v128_sr32( TB, 23 );\
XB2 = XOR( XB2, T ); \
XB2 = XOR( XB2, TB ); \
T = _mm_slli_epi32( TC, 9 );\
TC = _mm_srli_epi32( TC, 23 );\
T = v128_sl32( TC, 9 );\
TC = v128_sr32( TC, 23 );\
XC2 = XOR( XC2, T ); \
XC2 = XOR( XC2, TC ); \
\
TA = ADD32( XA2, XA1 ); \
TB = ADD32( XB2, XB1 ); \
TC = ADD32( XC2, XC1 ); \
T = _mm_slli_epi32( TA, 13); \
TA = _mm_srli_epi32( TA, 19 ); \
T = v128_sl32( TA, 13); \
TA = v128_sr32( TA, 19 ); \
XA1 = ROL_1X32( XA1 ); \
XB1 = ROL_1X32( XB1 ); \
XC1 = ROL_1X32( XC1 ); \
XA3 = XOR( XA3, T ); \
XA3 = XOR( XA3, TA ); \
T = _mm_slli_epi32( TB, 13); \
TB = _mm_srli_epi32( TB, 19 ); \
T = v128_sl32( TB, 13); \
TB = v128_sr32( TB, 19 ); \
XB3 = XOR( XB3, T ); \
XB3 = XOR( XB3, TB ); \
T = _mm_slli_epi32( TC, 13); \
TC = _mm_srli_epi32( TC, 19 ); \
T = v128_sl32( TC, 13); \
TC = v128_sr32( TC, 19 ); \
XC3 = XOR( XC3, T ); \
XC3 = XOR( XC3, TC ); \
\
TA = ADD32( XA3, XA2 ); \
TB = ADD32( XB3, XB2 ); \
TC = ADD32( XC3, XC2 ); \
T = _mm_slli_epi32( TA, 18 ); \
TA = _mm_srli_epi32( TA, 14 ); \
T = v128_sl32( TA, 18 ); \
TA = v128_sr32( TA, 14 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XC2 = SWAP_64( XC2 ); \
XA0 = XOR( XA0, T ); \
XA0 = XOR( XA0, TA ); \
T = _mm_slli_epi32( TB, 18 ); \
TB = _mm_srli_epi32( TB, 14 ); \
T = v128_sl32( TB, 18 ); \
TB = v128_sr32( TB, 14 ); \
XB0 = XOR( XB0, T ); \
XB0 = XOR( XB0, TB ); \
T = _mm_slli_epi32( TC, 18 ); \
TC = _mm_srli_epi32( TC, 14 ); \
T = v128_sl32( TC, 18 ); \
TC = v128_sr32( TC, 14 ); \
XC0 = XOR( XC0, T ); \
XC0 = XOR( XC0, TC ); \
\
TA = ADD32( XA0, XA1 ); \
TB = ADD32( XB0, XB1 ); \
TC = ADD32( XC0, XC1 ); \
T = _mm_slli_epi32( TA, 7 ); \
TA = _mm_srli_epi32( TA, 25 ); \
T = v128_sl32( TA, 7 ); \
TA = v128_sr32( TA, 25 ); \
XA3 = ROR_1X32( XA3 ); \
XA3 = XOR( XA3, T ); \
XA3 = XOR( XA3, TA ); \
T = _mm_slli_epi32( TB, 7 ); \
TB = _mm_srli_epi32( TB, 25 ); \
T = v128_sl32( TB, 7 ); \
TB = v128_sr32( TB, 25 ); \
XB3 = ROR_1X32( XB3 ); \
XB3 = XOR( XB3, T ); \
XB3 = XOR( XB3, TB ); \
T = _mm_slli_epi32( TC, 7 ); \
TC = _mm_srli_epi32( TC, 25 ); \
T = v128_sl32( TC, 7 ); \
TC = v128_sr32( TC, 25 ); \
XC3 = ROR_1X32( XC3 ); \
XC3 = XOR( XC3, T ); \
XC3 = XOR( XC3, TC ); \
@@ -512,55 +512,55 @@ do{ \
TA = ADD32( XA3, XA0 ); \
TB = ADD32( XB3, XB0 ); \
TC = ADD32( XC3, XC0 ); \
T = _mm_slli_epi32( TA, 9 ); \
TA = _mm_srli_epi32( TA, 23 ); \
T = v128_sl32( TA, 9 ); \
TA = v128_sr32( TA, 23 ); \
XA2 = XOR( XA2, T ); \
XA2 = XOR( XA2, TA ); \
T = _mm_slli_epi32( TB, 9 ); \
TB = _mm_srli_epi32( TB, 23 ); \
T = v128_sl32( TB, 9 ); \
TB = v128_sr32( TB, 23 ); \
XB2 = XOR( XB2, T ); \
XB2 = XOR( XB2, TB ); \
T = _mm_slli_epi32( TC, 9 ); \
TC = _mm_srli_epi32( TC, 23 ); \
T = v128_sl32( TC, 9 ); \
TC = v128_sr32( TC, 23 ); \
XC2 = XOR( XC2, T ); \
XC2 = XOR( XC2, TC ); \
\
TA = ADD32( XA2, XA3 ); \
TB = ADD32( XB2, XB3 ); \
TC = ADD32( XC2, XC3 ); \
T = _mm_slli_epi32( TA, 13 ); \
TA = _mm_srli_epi32( TA, 19 ); \
T = v128_sl32( TA, 13 ); \
TA = v128_sr32( TA, 19 ); \
XA3 = ROL_1X32( XA3 ); \
XB3 = ROL_1X32( XB3 ); \
XC3 = ROL_1X32( XC3 ); \
XA1 = XOR( XA1, T ); \
XA1 = XOR( XA1, TA ); \
T = _mm_slli_epi32( TB, 13 ); \
TB = _mm_srli_epi32( TB, 19 ); \
T = v128_sl32( TB, 13 ); \
TB = v128_sr32( TB, 19 ); \
XB1 = XOR( XB1, T ); \
XB1 = XOR( XB1, TB ); \
T = _mm_slli_epi32( TC, 13 ); \
TC = _mm_srli_epi32( TC, 19 ); \
T = v128_sl32( TC, 13 ); \
TC = v128_sr32( TC, 19 ); \
XC1 = XOR( XC1, T ); \
XC1 = XOR( XC1, TC ); \
\
TA = ADD32( XA1, XA2 ); \
TB = ADD32( XB1, XB2 ); \
TC = ADD32( XC1, XC2 ); \
T = _mm_slli_epi32( TA, 18 ); \
TA = _mm_srli_epi32( TA, 14 ); \
T = v128_sl32( TA, 18 ); \
TA = v128_sr32( TA, 14 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XA0 = XOR( XA0, T ); \
XA0 = XOR( XA0, TA ); \
T = _mm_slli_epi32( TB, 18 ); \
TB = _mm_srli_epi32( TB, 14 ); \
T = v128_sl32( TB, 18 ); \
TB = v128_sr32( TB, 14 ); \
XC2 = SWAP_64( XC2 ); \
XA1 = ROR_1X32( XA1 ); \
XB0 = XOR( XB0, T ); \
XB0 = XOR( XB0, TB ); \
T = _mm_slli_epi32( TC, 18 ); \
TC = _mm_srli_epi32( TC, 14 ); \
T = v128_sl32( TC, 18 ); \
TC = v128_sr32( TC, 14 ); \
XB1 = ROR_1X32( XB1 ); \
XC1 = ROR_1X32( XC1 ); \
XC0 = XOR( XC0, T ); \
@@ -832,7 +832,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N )
// Working, not up to date, needs stream, shuffle optimizations.
// 4x32 interleaving
static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
static void salsa8_simd128_4way( v128_t *b, const v128_t *c )
{
__m512i X0, X1, X2, X3, Y0, Y1, Y2, Y3;
__m512i *B = (__m512i*)b;
@@ -902,7 +902,7 @@ static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
// { l3d3, l2d3, l1d3, l0d3, l3d2, l2d2, l1d2, l0d2,
// l3d1, l2d1, l1d1, l0d1, l3d0, l2d0, l1d0, l0d0 }
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
void scrypt_core_simd128_4way( v128_t *X, v128_t *V, const uint32_t N )
{
for ( int n = 0; n < N; n++ )
{
@@ -923,7 +923,7 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
for( int i = 0; i < 32; i++ )
{
X[i] = _mm_xor_si128( X[i], _mm_set_epi32( v[ x16[3] + i ].u32[3],
X[i] = v128_xor( X[i], v128_set_32( v[ x16[3] + i ].u32[3],
v[ x16[2] + i ].u32[2],
v[ x16[1] + i ].u32[1],
v[ x16[0] + i ].u32[0] ) );
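The v128_set_32 gather above is the interleaved form of scrypt's data-dependent read: each of the four lanes takes its own index from word 16 of its block and XORs its own V[j] back into X. For orientation, the scalar shape of the core that all these variants implement (a sketch; salsa20_8_sketch is a placeholder, and N is assumed to be a power of two as the masking above requires):

#include <stdint.h>
#include <string.h>

void salsa20_8_sketch( uint32_t *b, const uint32_t *c );   // placeholder

static void scrypt_core_sketch( uint32_t X[32], uint32_t *V, uint32_t N )
{
   for ( uint32_t n = 0; n < N; n++ )         // fill phase: sequential writes
   {
      memcpy( V + n*32, X, 32 * sizeof(uint32_t) );
      salsa20_8_sketch( X,      X + 16 );
      salsa20_8_sketch( X + 16, X      );
   }
   for ( uint32_t n = 0; n < N; n++ )         // mix phase: random reads
   {
      const uint32_t j = X[16] & ( N - 1 );   // data-dependent index
      for ( int i = 0; i < 32; i++ )
         X[i] ^= V[ j*32 + i ];
      salsa20_8_sketch( X,      X + 16 );
      salsa20_8_sketch( X + 16, X      );
   }
}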
@@ -2003,28 +2003,28 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V,
// Scrypt 2x faster than pooler
// 4x memory usage
// 4x32 interleaving
static void xor_salsa8_4way( __m128i * const B, const __m128i * const C )
static void xor_salsa8_4way( v128_t * const B, const v128_t * const C )
{
__m128i x0 = B[ 0] = _mm_xor_si128( B[ 0], C[ 0] );
__m128i x1 = B[ 1] = _mm_xor_si128( B[ 1], C[ 1] );
__m128i x2 = B[ 2] = _mm_xor_si128( B[ 2], C[ 2] );
__m128i x3 = B[ 3] = _mm_xor_si128( B[ 3], C[ 3] );
__m128i x4 = B[ 4] = _mm_xor_si128( B[ 4], C[ 4] );
__m128i x5 = B[ 5] = _mm_xor_si128( B[ 5], C[ 5] );
__m128i x6 = B[ 6] = _mm_xor_si128( B[ 6], C[ 6] );
__m128i x7 = B[ 7] = _mm_xor_si128( B[ 7], C[ 7] );
__m128i x8 = B[ 8] = _mm_xor_si128( B[ 8], C[ 8] );
__m128i x9 = B[ 9] = _mm_xor_si128( B[ 9], C[ 9] );
__m128i xa = B[10] = _mm_xor_si128( B[10], C[10] );
__m128i xb = B[11] = _mm_xor_si128( B[11], C[11] );
__m128i xc = B[12] = _mm_xor_si128( B[12], C[12] );
__m128i xd = B[13] = _mm_xor_si128( B[13], C[13] );
__m128i xe = B[14] = _mm_xor_si128( B[14], C[14] );
__m128i xf = B[15] = _mm_xor_si128( B[15], C[15] );
v128_t x0 = B[ 0] = v128_xor( B[ 0], C[ 0] );
v128_t x1 = B[ 1] = v128_xor( B[ 1], C[ 1] );
v128_t x2 = B[ 2] = v128_xor( B[ 2], C[ 2] );
v128_t x3 = B[ 3] = v128_xor( B[ 3], C[ 3] );
v128_t x4 = B[ 4] = v128_xor( B[ 4], C[ 4] );
v128_t x5 = B[ 5] = v128_xor( B[ 5], C[ 5] );
v128_t x6 = B[ 6] = v128_xor( B[ 6], C[ 6] );
v128_t x7 = B[ 7] = v128_xor( B[ 7], C[ 7] );
v128_t x8 = B[ 8] = v128_xor( B[ 8], C[ 8] );
v128_t x9 = B[ 9] = v128_xor( B[ 9], C[ 9] );
v128_t xa = B[10] = v128_xor( B[10], C[10] );
v128_t xb = B[11] = v128_xor( B[11], C[11] );
v128_t xc = B[12] = v128_xor( B[12], C[12] );
v128_t xd = B[13] = v128_xor( B[13], C[13] );
v128_t xe = B[14] = v128_xor( B[14], C[14] );
v128_t xf = B[15] = v128_xor( B[15], C[15] );
#define ROL32 mm128_rol_32
#define ADD32 _mm_add_epi32
#define XOR _mm_xor_si128
#define ROL32 v128_rol32
#define ADD32 v128_add32
#define XOR v128_xor
SALSA_8ROUNDS;
@@ -2032,25 +2032,25 @@ static void xor_salsa8_4way( __m128i * const B, const __m128i * const C )
#undef ADD32
#undef XOR
B[ 0] = _mm_add_epi32( B[ 0], x0 );
B[ 1] = _mm_add_epi32( B[ 1], x1 );
B[ 2] = _mm_add_epi32( B[ 2], x2 );
B[ 3] = _mm_add_epi32( B[ 3], x3 );
B[ 4] = _mm_add_epi32( B[ 4], x4 );
B[ 5] = _mm_add_epi32( B[ 5], x5 );
B[ 6] = _mm_add_epi32( B[ 6], x6 );
B[ 7] = _mm_add_epi32( B[ 7], x7 );
B[ 8] = _mm_add_epi32( B[ 8], x8 );
B[ 9] = _mm_add_epi32( B[ 9], x9 );
B[10] = _mm_add_epi32( B[10], xa );
B[11] = _mm_add_epi32( B[11], xb );
B[12] = _mm_add_epi32( B[12], xc );
B[13] = _mm_add_epi32( B[13], xd );
B[14] = _mm_add_epi32( B[14], xe );
B[15] = _mm_add_epi32( B[15], xf );
B[ 0] = v128_add32( B[ 0], x0 );
B[ 1] = v128_add32( B[ 1], x1 );
B[ 2] = v128_add32( B[ 2], x2 );
B[ 3] = v128_add32( B[ 3], x3 );
B[ 4] = v128_add32( B[ 4], x4 );
B[ 5] = v128_add32( B[ 5], x5 );
B[ 6] = v128_add32( B[ 6], x6 );
B[ 7] = v128_add32( B[ 7], x7 );
B[ 8] = v128_add32( B[ 8], x8 );
B[ 9] = v128_add32( B[ 9], x9 );
B[10] = v128_add32( B[10], xa );
B[11] = v128_add32( B[11], xb );
B[12] = v128_add32( B[12], xc );
B[13] = v128_add32( B[13], xd );
B[14] = v128_add32( B[14], xe );
B[15] = v128_add32( B[15], xf );
}
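xor_salsa8_4way above, like the other salsa8 variants in this file, binds ROL32/ADD32/XOR to width-specific operations immediately before expanding the shared SALSA_8ROUNDS template and #undefs them afterwards, so one round body can serve several vector widths. The idiom in miniature, reduced to scalar types (a toy illustration, not project code):

#include <stdint.h>

// Round body written once against operation macros the caller supplies.
#define ROUND_BODY( x, y )  ( (x) = XOR( (x), ROL32( ADD32( (x), (y) ), 7 ) ) )

static inline uint32_t toy_round( uint32_t x, uint32_t y )
{
#define ADD32( a, b )  ( (a) + (b) )
#define XOR( a, b )    ( (a) ^ (b) )
#define ROL32( a, c )  ( ( (a) << (c) ) | ( (a) >> ( 32 - (c) ) ) )
   ROUND_BODY( x, y );
#undef ADD32
#undef XOR
#undef ROL32
   return x;
}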
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N )
void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
{
for ( int n = 0; n < N; n++ )
{
@@ -2074,7 +2074,7 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N )
m128_ovly v;
for ( int l = 0; l < 4; l++ )
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
X[i] = _mm_xor_si128( X[i], v.m128 );
X[i] = v128_xor( X[i], v.m128 );
}
xor_salsa8_4way( &X[ 0], &X[16] );
@@ -2095,27 +2095,27 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N )
// No interleaving
static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
{
__m128i X0, X1, X2, X3;
__m128i *B = (__m128i*)b;
const __m128i *C = (const __m128i*)c;
v128_t X0, X1, X2, X3;
v128_t *B = (v128_t*)b;
const v128_t *C = (const v128_t*)c;
// define targets for macros used in round function template
#define ROL_1X32 mm128_shufll_32
#define ROR_1X32 mm128_shuflr_32
#define SWAP_64 mm128_swap_64
#define ROL32 mm128_rol_32
#define ADD32 _mm_add_epi32
#define XOR _mm_xor_si128
#define ROL_1X32 v128_shufll32
#define ROR_1X32 v128_shuflr32
#define SWAP_64 v128_swap64
#define ROL32 v128_rol32
#define ADD32 v128_add32
#define XOR v128_xor
// mix C into B then shuffle B into X
B[0] = _mm_xor_si128( B[0], C[0] );
B[1] = _mm_xor_si128( B[1], C[1] );
B[2] = _mm_xor_si128( B[2], C[2] );
B[3] = _mm_xor_si128( B[3], C[3] );
B[0] = v128_xor( B[0], C[0] );
B[1] = v128_xor( B[1], C[1] );
B[2] = v128_xor( B[2], C[2] );
B[3] = v128_xor( B[3], C[3] );
#if defined(__SSE4_1__)
__m128i Y0, Y1, Y2, Y3;
v128_t Y0, Y1, Y2, Y3;
#if defined(__AVX2__)
@@ -2188,19 +2188,19 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
#endif // AVX2 else SSE4_1
B[0] = _mm_add_epi32( B[0], Y0 );
B[1] = _mm_add_epi32( B[1], Y1 );
B[2] = _mm_add_epi32( B[2], Y2 );
B[3] = _mm_add_epi32( B[3], Y3 );
B[0] = v128_add32( B[0], Y0 );
B[1] = v128_add32( B[1], Y1 );
B[2] = v128_add32( B[2], Y2 );
B[3] = v128_add32( B[3], Y3 );
#else // SSE2
m128_ovly y[4], z[4];
X0 = _mm_set_epi32( b[15], b[10], b[ 5], b[ 0] );
X1 = _mm_set_epi32( b[ 3], b[14], b[ 9], b[ 4] );
X2 = _mm_set_epi32( b[ 7], b[ 2], b[13], b[ 8] );
X3 = _mm_set_epi32( b[11], b[ 6], b[ 1], b[12] );
X0 = v128_set_32( b[15], b[10], b[ 5], b[ 0] );
X1 = v128_set_32( b[ 3], b[14], b[ 9], b[ 4] );
X2 = v128_set_32( b[ 7], b[ 2], b[13], b[ 8] );
X3 = v128_set_32( b[11], b[ 6], b[ 1], b[12] );
SALSA_8ROUNDS_FINAL_SIMD128;
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
z[3].u32[1] = y[2].u32[3];
z[3].u32[0] = y[3].u32[3];
B[0] = _mm_add_epi32( B[0], z[0].m128 );
B[1] = _mm_add_epi32( B[1], z[1].m128 );
B[2] = _mm_add_epi32( B[2], z[2].m128 );
B[3] = _mm_add_epi32( B[3], z[3].m128 );
B[0] = v128_add32( B[0], z[0].m128 );
B[1] = v128_add32( B[1], z[1].m128 );
B[2] = v128_add32( B[2], z[2].m128 );
B[3] = v128_add32( B[3], z[3].m128 );
#endif
@@ -2257,7 +2257,7 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
for ( int n = 0; n < N; n++ )
{
for ( int i = 0; i < 8; i++ )
_mm_stream_si128( (__m128i*)V + n*8 + i, casti_m128i( X, i ) );
_mm_stream_si128( (v128_t*)V + n*8 + i, casti_v128( X, i ) );
salsa8_simd128( &X[ 0], &X[16] );
salsa8_simd128( &X[16], &X[ 0] );
@@ -2277,15 +2277,15 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;
#if defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
v128_t t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
v128_t t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
@@ -2301,16 +2301,16 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
#else // SSE2
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] );
YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] );
YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] );
YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] );
YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] );
YA0 = v128_set_32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = v128_set_32( xb[15], xb[10], xb[ 5], xb[ 0] );
YA1 = v128_set_32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
YB1 = v128_set_32( xb[ 3], xb[14], xb[ 9], xb[ 4] );
YA2 = v128_set_32( xa[ 7], xa[ 2], xa[13], xa[ 8] );
YB2 = v128_set_32( xb[ 7], xb[ 2], xb[13], xb[ 8] );
YA3 = v128_set_32( xa[11], xa[ 6], xa[ 1], xa[12] );
YB3 = v128_set_32( xb[11], xb[ 6], xb[ 1], xb[12] );
XA[0] = YA0;
XB[0] = YB0;
@@ -2327,15 +2327,15 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;
#if defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
v128_t t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
v128_t t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
@@ -2413,29 +2413,29 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
const uint32_t * const ca, const uint32_t * const cb )
{
__m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3;
__m128i *BA = (__m128i*)ba;
__m128i *BB = (__m128i*)bb;
const __m128i *CA = (const __m128i*)ca;
const __m128i *CB = (const __m128i*)cb;
v128_t XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3;
v128_t *BA = (v128_t*)ba;
v128_t *BB = (v128_t*)bb;
const v128_t *CA = (const v128_t*)ca;
const v128_t *CB = (const v128_t*)cb;
// define targets for macros used in round function template
#define ROL_1X32 mm128_shufll_32
#define ROR_1X32 mm128_shuflr_32
#define SWAP_64 mm128_swap_64
#define ROL32 mm128_rol_32
#define ADD32 _mm_add_epi32
#define XOR _mm_xor_si128
#define TYPE __m128i
#define ROL_1X32 v128_shufll32
#define ROR_1X32 v128_shuflr32
#define SWAP_64 v128_swap64
#define ROL32 v128_rol32
#define ADD32 v128_add32
#define XOR v128_xor
#define TYPE v128_t
XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] );
XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] );
XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] );
XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] );
XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] );
XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] );
XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] );
XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] );
XA0 = BA[0] = v128_xor( BA[0], CA[0] );
XB0 = BB[0] = v128_xor( BB[0], CB[0] );
XA1 = BA[1] = v128_xor( BA[1], CA[1] );
XB1 = BB[1] = v128_xor( BB[1], CB[1] );
XA2 = BA[2] = v128_xor( BA[2], CA[2] );
XB2 = BB[2] = v128_xor( BB[2], CB[2] );
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -2447,14 +2447,14 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
#endif
BA[0] = _mm_add_epi32( BA[0], XA0 );
BB[0] = _mm_add_epi32( BB[0], XB0 );
BA[1] = _mm_add_epi32( BA[1], XA1 );
BB[1] = _mm_add_epi32( BB[1], XB1 );
BA[2] = _mm_add_epi32( BA[2], XA2 );
BB[2] = _mm_add_epi32( BB[2], XB2 );
BA[3] = _mm_add_epi32( BA[3], XA3 );
BB[3] = _mm_add_epi32( BB[3], XB3 );
BA[0] = v128_add32( BA[0], XA0 );
BB[0] = v128_add32( BB[0], XB0 );
BA[1] = v128_add32( BA[1], XA1 );
BB[1] = v128_add32( BB[1], XB1 );
BA[2] = v128_add32( BA[2], XA2 );
BB[2] = v128_add32( BB[2], XB2 );
BA[3] = v128_add32( BA[3], XA3 );
BB[3] = v128_add32( BB[3], XB3 );
#undef ROL_1X32
#undef ROR_1X32
@@ -2489,8 +2489,8 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
for ( int i = 0; i < 8; i++ )
{
_mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) );
_mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) );
_mm_stream_si128( (v128_t*)V0 + n*8 + i, casti_v128( X0, i ) );
_mm_stream_si128( (v128_t*)V1 + n*8 + i, casti_v128( X1, i ) );
}
#else
@@ -2535,10 +2535,10 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
const int j1 = 8 * ( X1[16] & ( N-1 ) );
for ( int i = 0; i < 8; i++ )
{
const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i );
const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i );
casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 );
casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 );
const v128_t v0 = v128_load( ( (v128_t*)V0 ) +j0+i );
const v128_t v1 = v128_load( ( (v128_t*)V1 ) +j1+i );
casti_v128( X0, i ) = v128_xor( casti_v128( X0, i ), v0 );
casti_v128( X1, i ) = v128_xor( casti_v128( X1, i ), v1 );
}
#endif
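Note that the fill-phase loops above keep _mm_stream_si128 even after the v128 conversion: the scratchpad is written once and only read back much later, so non-temporal stores that bypass the cache are a better fit than ordinary stores. A minimal illustration (the intrinsic is standard SSE2; the wrapper name is made up for this note):

#include <emmintrin.h>

// Non-temporal 16-byte store: data goes to memory without displacing
// cache lines. dst must be 16-byte aligned; a fence would typically be
// needed before other threads read the data, which this single-threaded
// scratchpad fill does not require.
static inline void scratch_store_sketch( void *dst, __m128i v )
{
   _mm_stream_si128( (__m128i*)dst, v );
}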
@@ -2555,16 +2555,16 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
uint32_t *xc )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i *XC = (__m128i*)xc;
v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;
v128_t *XC = (v128_t*)xc;
#if defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
v128_t t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
v128_t t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
@@ -2588,20 +2588,20 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
#else // SSE2
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] );
YC1 = _mm_set_epi32( xc[ 3], xc[14], xc[ 9], xc[ 4] );
YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] );
YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] );
YC2 = _mm_set_epi32( xc[ 7], xc[ 2], xc[13], xc[ 8] );
YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] );
YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] );
YC3 = _mm_set_epi32( xc[11], xc[ 6], xc[ 1], xc[12] );
YA0 = v128_set_32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = v128_set_32( xb[15], xb[10], xb[ 5], xb[ 0] );
YC0 = v128_set_32( xc[15], xc[10], xc[ 5], xc[ 0] );
YA1 = v128_set_32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
YB1 = v128_set_32( xb[ 3], xb[14], xb[ 9], xb[ 4] );
YC1 = v128_set_32( xc[ 3], xc[14], xc[ 9], xc[ 4] );
YA2 = v128_set_32( xa[ 7], xa[ 2], xa[13], xa[ 8] );
YB2 = v128_set_32( xb[ 7], xb[ 2], xb[13], xb[ 8] );
YC2 = v128_set_32( xc[ 7], xc[ 2], xc[13], xc[ 8] );
YA3 = v128_set_32( xa[11], xa[ 6], xa[ 1], xa[12] );
YB3 = v128_set_32( xb[11], xb[ 6], xb[ 1], xb[12] );
YC3 = v128_set_32( xc[11], xc[ 6], xc[ 1], xc[12] );
XA[0] = YA0;
XB[0] = YB0;
@@ -2622,16 +2622,16 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
uint32_t* xc )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i *XC = (__m128i*)xc;
v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;
v128_t *XC = (v128_t*)xc;
#if defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
v128_t t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
v128_t t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
@@ -2743,36 +2743,36 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
const uint32_t *ca, const uint32_t *cb, const uint32_t *cc )
{
__m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3,
v128_t XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3,
XC0, XC1, XC2, XC3;
__m128i *BA = (__m128i*)ba;
__m128i *BB = (__m128i*)bb;
__m128i *BC = (__m128i*)bc;
const __m128i *CA = (const __m128i*)ca;
const __m128i *CB = (const __m128i*)cb;
const __m128i *CC = (const __m128i*)cc;
v128_t *BA = (v128_t*)ba;
v128_t *BB = (v128_t*)bb;
v128_t *BC = (v128_t*)bc;
const v128_t *CA = (const v128_t*)ca;
const v128_t *CB = (const v128_t*)cb;
const v128_t *CC = (const v128_t*)cc;
// define targets for macros used in round function template
#define ROL_1X32 mm128_shufll_32
#define ROR_1X32 mm128_shuflr_32
#define SWAP_64 mm128_swap_64
#define ROL32 mm128_rol_32
#define ADD32 _mm_add_epi32
#define XOR _mm_xor_si128
#define TYPE __m128i
#define ROL_1X32 v128_shufll32
#define ROR_1X32 v128_shuflr32
#define SWAP_64 v128_swap64
#define ROL32 v128_rol32
#define ADD32 v128_add32
#define XOR v128_xor
#define TYPE v128_t
XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] );
XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] );
XC0 = BC[0] = _mm_xor_si128( BC[0], CC[0] );
XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] );
XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] );
XC1 = BC[1] = _mm_xor_si128( BC[1], CC[1] );
XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] );
XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] );
XC2 = BC[2] = _mm_xor_si128( BC[2], CC[2] );
XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] );
XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] );
XC3 = BC[3] = _mm_xor_si128( BC[3], CC[3] );
XA0 = BA[0] = v128_xor( BA[0], CA[0] );
XB0 = BB[0] = v128_xor( BB[0], CB[0] );
XC0 = BC[0] = v128_xor( BC[0], CC[0] );
XA1 = BA[1] = v128_xor( BA[1], CA[1] );
XB1 = BB[1] = v128_xor( BB[1], CB[1] );
XC1 = BC[1] = v128_xor( BC[1], CC[1] );
XA2 = BA[2] = v128_xor( BA[2], CA[2] );
XB2 = BB[2] = v128_xor( BB[2], CB[2] );
XC2 = BC[2] = v128_xor( BC[2], CC[2] );
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
XC3 = BC[3] = v128_xor( BC[3], CC[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -2784,18 +2784,18 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
#endif
BA[0] = _mm_add_epi32( BA[0], XA0 );
BB[0] = _mm_add_epi32( BB[0], XB0 );
BC[0] = _mm_add_epi32( BC[0], XC0 );
BA[1] = _mm_add_epi32( BA[1], XA1 );
BB[1] = _mm_add_epi32( BB[1], XB1 );
BC[1] = _mm_add_epi32( BC[1], XC1 );
BA[2] = _mm_add_epi32( BA[2], XA2 );
BB[2] = _mm_add_epi32( BB[2], XB2 );
BC[2] = _mm_add_epi32( BC[2], XC2 );
BA[3] = _mm_add_epi32( BA[3], XA3 );
BB[3] = _mm_add_epi32( BB[3], XB3 );
BC[3] = _mm_add_epi32( BC[3], XC3 );
BA[0] = v128_add32( BA[0], XA0 );
BB[0] = v128_add32( BB[0], XB0 );
BC[0] = v128_add32( BC[0], XC0 );
BA[1] = v128_add32( BA[1], XA1 );
BB[1] = v128_add32( BB[1], XB1 );
BC[1] = v128_add32( BC[1], XC1 );
BA[2] = v128_add32( BA[2], XA2 );
BB[2] = v128_add32( BB[2], XB2 );
BC[2] = v128_add32( BC[2], XC2 );
BA[3] = v128_add32( BA[3], XA3 );
BB[3] = v128_add32( BB[3], XB3 );
BC[3] = v128_add32( BC[3], XC3 );
#undef ROL_1X32
#undef ROR_1X32
@@ -2833,9 +2833,9 @@ void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N )
for ( int i = 0; i < 8; i++ )
{
_mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) );
_mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) );
_mm_stream_si128( (__m128i*)V2 + n*8 + i, casti_m128i( X2, i ) );
_mm_stream_si128( (v128_t*)V0 + n*8 + i, casti_v128( X0, i ) );
_mm_stream_si128( (v128_t*)V1 + n*8 + i, casti_v128( X1, i ) );
_mm_stream_si128( (v128_t*)V2 + n*8 + i, casti_v128( X2, i ) );
}
#else
@@ -2891,12 +2891,12 @@ void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N )
const int j2 = 8 * ( X2[16] & ( N-1 ) );
for ( int i = 0; i < 8; i++ )
{
const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i );
const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i );
const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+i );
casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 );
casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 );
casti_m128i( X2, i ) = _mm_xor_si128( casti_m128i( X2, i ), v2 );
const v128_t v0 = v128_load( ( (v128_t*)V0 ) +j0+i );
const v128_t v1 = v128_load( ( (v128_t*)V1 ) +j1+i );
const v128_t v2 = v128_load( ( (v128_t*)V2 ) +j2+i );
casti_v128( X0, i ) = v128_xor( casti_v128( X0, i ), v0 );
casti_v128( X1, i ) = v128_xor( casti_v128( X1, i ), v1 );
casti_v128( X2, i ) = v128_xor( casti_v128( X2, i ), v2 );
}
#endif

View File

@@ -10,7 +10,7 @@
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );
// Serial SIMD over 4 way parallel
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N );
void scrypt_core_simd128_4way( v128_t *X, v128_t *V, const uint32_t N );
// 4 way parallel over serial SIMD
void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N );
@@ -44,10 +44,8 @@ void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N )
#endif
#if defined(__SSE2__)
// Parallel 4 way, 4x memory
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N );
void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N );
// Linear SIMD 1 way, 1x memory, lowest
void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );
@@ -61,8 +59,6 @@ void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N );
// Quadruple buffered, 4x memory
void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N );
#endif
// For reference only
void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N );
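The declarations above expose one kernel per trade-off: the 4way variants run four independent lanes (4x scratchpad), the simd128 variants vectorise a single lane (1x scratchpad), and the *_buf variants software-pipeline two to four single-lane instances. A hypothetical call into the 1x-memory kernel, mainly to show the scratchpad sizing of 128 bytes per N entry implied by the store loops earlier in this commit:

#include <stdint.h>
#include <stdlib.h>

void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );

// Sketch only: allocate the N*128-byte scratchpad and run one lane.
// X holds two 64-byte Salsa blocks and is updated in place.
static int run_one_lane_sketch( uint32_t X[32], uint32_t N )
{
   uint32_t *V = (uint32_t*)aligned_alloc( 64, (size_t)N * 128 );
   if ( !V ) return 0;
   scrypt_core_simd128( X, V, N );
   free( V );
   return 1;
}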

View File

@@ -173,7 +173,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
memcpy( pad1, key1 + 16, 16 );
memcpy( pad1 + 4, keypad, 48 );
sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
sha256_2x_transform_le( tstate0, tstate1, pad0, pad1,
tstate0, tstate1 );
memcpy( ihash0, tstate0, 32 );
@@ -186,7 +186,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
}
for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c;
sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1,
sha256_2x_transform_le( ostate0, ostate1, pad0, pad1,
sha256_initial_state, sha256_initial_state );
for ( i = 0; i < 8; i++ )
@@ -196,7 +196,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
}
for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636;
sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
sha256_2x_transform_le( tstate0, tstate1, pad0, pad1,
sha256_initial_state, sha256_initial_state );
}
@@ -209,7 +209,7 @@ static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0,
uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16];
int i, j;
sha256_ni2way_transform_le( istate0, istate1, salt0, salt1,
sha256_2x_transform_le( istate0, istate1, salt0, salt1,
tstate0, tstate1 );
memcpy( ibuf0, salt0 + 16, 16 );
@@ -225,10 +225,10 @@ static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0,
memcpy( obuf1, istate1, 32 );
ibuf0[4] = ibuf1[4] = i + 1;
sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1,
obuf0, obuf1 );
sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1,
ostate0, ostate1 );
sha256_2x_transform_le( obuf0, obuf1, ibuf0, ibuf1,
obuf0, obuf1 );
sha256_2x_transform_le( ostateb0, ostateb1, obuf0, obuf1,
ostate0, ostate1 );
for ( j = 0; j < 8; j++ )
{
@@ -246,20 +246,20 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
uint32_t buf0[16], buf1[16];
int i;
sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1,
tstate0, tstate1 );
sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16,
tstate0, tstate1 );
sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk,
tstate0, tstate1 );
sha256_2x_transform_be( tstate0, tstate1, salt0, salt1,
tstate0, tstate1 );
sha256_2x_transform_be( tstate0, tstate1, salt0+16, salt1+16,
tstate0, tstate1 );
sha256_2x_transform_le( tstate0, tstate1, finalblk, finalblk,
tstate0, tstate1 );
memcpy( buf0, tstate0, 32 );
memcpy( buf0 + 8, outerpad, 32 );
memcpy( buf1, tstate1, 32 );
memcpy( buf1 + 8, outerpad, 32 );
sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1,
ostate0, ostate1 );
sha256_2x_transform_le( ostate0, ostate1, buf0, buf1,
ostate0, ostate1 );
for ( i = 0; i < 8; i++ )
{
@@ -272,8 +272,6 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
#endif
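The 0x36363636 and 0x5c5c5c5c fills in the routines above are the standard HMAC ipad and opad bytes: tstate ends up holding SHA-256 compressed over key XOR ipad and ostate over key XOR opad, so only the inner and outer finishing blocks remain per nonce. A single-lane sketch of that precomputation for the textbook case of a one-block key (the 80-byte header used above is longer, so the real routine first folds the extra 16 bytes in; the transform prototype here is an assumption modelled on the multi-lane ones in this commit):

#include <stdint.h>

// Assumed primitive: compress one 64-byte LE block 'data' starting from
// 'state_in', writing the new state to 'state_out'.
void sha256_transform_le_sketch( uint32_t *state_out, const uint32_t *data,
                                 const uint32_t *state_in );

static const uint32_t sha256_iv_sketch[8] =
   { 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
     0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };

static void hmac_sha256_precompute_sketch( uint32_t tstate[8],
                     uint32_t ostate[8], const uint32_t key[16] )
{
   uint32_t pad[16];
   for ( int i = 0; i < 16; i++ ) pad[i] = key[i] ^ 0x36363636;   // ipad
   sha256_transform_le_sketch( tstate, pad, sha256_iv_sketch );
   for ( int i = 0; i < 16; i++ ) pad[i] = key[i] ^ 0x5c5c5c5c;   // opad
   sha256_transform_le_sketch( ostate, pad, sha256_iv_sketch );
}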
#ifdef HAVE_SHA256_4WAY
static const uint32_t keypad_4way[4 * 12] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -335,14 +333,14 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = {
static inline void sha256_4way_init_state( void *state )
{
casti_m128i( state, 0 ) = _mm_set1_epi32( 0x6A09E667 );
casti_m128i( state, 1 ) = _mm_set1_epi32( 0xBB67AE85 );
casti_m128i( state, 2 ) = _mm_set1_epi32( 0x3C6EF372 );
casti_m128i( state, 3 ) = _mm_set1_epi32( 0xA54FF53A );
casti_m128i( state, 4 ) = _mm_set1_epi32( 0x510E527F );
casti_m128i( state, 5 ) = _mm_set1_epi32( 0x9B05688C );
casti_m128i( state, 6 ) = _mm_set1_epi32( 0x1F83D9AB );
casti_m128i( state, 7 ) = _mm_set1_epi32( 0x5BE0CD19 );
casti_v128( state, 0 ) = v128_32( 0x6A09E667 );
casti_v128( state, 1 ) = v128_32( 0xBB67AE85 );
casti_v128( state, 2 ) = v128_32( 0x3C6EF372 );
casti_v128( state, 3 ) = v128_32( 0xA54FF53A );
casti_v128( state, 4 ) = v128_32( 0x510E527F );
casti_v128( state, 5 ) = v128_32( 0x9B05688C );
casti_v128( state, 6 ) = v128_32( 0x1F83D9AB );
casti_v128( state, 7 ) = v128_32( 0x5BE0CD19 );
}
static inline void HMAC_SHA256_80_init_4way( const uint32_t *key,
@@ -356,22 +354,22 @@ static inline void HMAC_SHA256_80_init_4way( const uint32_t *key,
memcpy( pad, key + 4*16, 4*16 );
memcpy( pad + 4*4, keypad_4way, 4*48 );
sha256_4way_transform_le( (__m128i*)ihash, (__m128i*)pad,
(const __m128i*)tstate );
sha256_4way_transform_le( (v128_t*)ihash, (v128_t*)pad,
(const v128_t*)tstate );
sha256_4way_init_state( tstate );
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c;
sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)pad,
(const __m128i*)tstate );
sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)pad,
(const v128_t*)tstate );
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
for ( ; i < 4*16; i++ ) pad[i] = 0x36363636;
sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)pad,
(const __m128i*)tstate );
sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)pad,
(const v128_t*)tstate );
}
static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
@@ -383,8 +381,8 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
uint32_t _ALIGN(16) obuf[4 * 16];
int i, j;
sha256_4way_transform_le( (__m128i*)istate, (__m128i*)salt,
(const __m128i*)tstate );
sha256_4way_transform_le( (v128_t*)istate, (v128_t*)salt,
(const v128_t*)tstate );
memcpy(ibuf, salt + 4 * 16, 4 * 16);
memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44);
@@ -397,11 +395,11 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
ibuf[4 * 4 + 2] = i + 1;
ibuf[4 * 4 + 3] = i + 1;
sha256_4way_transform_le( (__m128i*)obuf, (__m128i*)ibuf,
(const __m128i*)istate );
sha256_4way_transform_le( (v128_t*)obuf, (v128_t*)ibuf,
(const v128_t*)istate );
sha256_4way_transform_le( (__m128i*)ostate2, (__m128i*)obuf,
(const __m128i*)ostate );
sha256_4way_transform_le( (v128_t*)ostate2, (v128_t*)obuf,
(const v128_t*)ostate );
for ( j = 0; j < 4 * 8; j++ )
output[4 * 8 * i + j] = bswap_32( ostate2[j] );
@@ -411,38 +409,36 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
uint32_t *ostate, const uint32_t *salt, uint32_t *output )
{
__m128i _ALIGN(64) final[ 8*16 ];
v128_t _ALIGN(64) final[ 8*16 ];
uint32_t _ALIGN(64) buf[4 * 16];
int i;
sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)salt,
(const __m128i*)tstate );
sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)( salt + 4*16),
(const __m128i*)tstate );
sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)salt,
(const v128_t*)tstate );
sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16),
(const v128_t*)tstate );
final[ 0] = _mm_set1_epi32( 0x00000001 );
final[ 1] = _mm_set1_epi32( 0x80000000 );
final[ 0] = v128_32( 0x00000001 );
final[ 1] = v128_32( 0x80000000 );
final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6]
= final[ 7] = final[ 8] = final[ 9] = final[10]
= final[11] = final[12] = final[13] = final[14]
= _mm_setzero_si128();
final[15] = _mm_set1_epi32 ( 0x00000620 );
= v128_xor( final[ 0], final[ 0] ); //_mm_setzero_si128();
final[15] = v128_32 ( 0x00000620 );
sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)final,
(const __m128i*)tstate );
sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)final,
(const v128_t*)tstate );
memcpy(buf, tstate, 4 * 32);
memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);
sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)buf,
(const __m128i*)ostate );
sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)buf,
(const v128_t*)ostate );
for ( i = 0; i < 4 * 8; i++ )
output[i] = bswap_32( ostate[i] );
}
#endif /* HAVE_SHA256_4WAY */
#ifdef HAVE_SHA256_8WAY
@@ -878,9 +874,9 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
// SSE2 working
intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 );
scrypt_core_4way( (__m128i*) W, (__m128i*)V, N );
scrypt_core_4way( (v128_t*) W, (v128_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N );
scrypt_core_4way( (v128_t*)(W+128), (v128_t*)V, N );
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
*/
@@ -1016,13 +1012,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 );
intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 );
scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N );
scrypt_core_simd128_4way( (v128_t*)W, (v128_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N );
scrypt_core_simd128_4way( (v128_t*)(W+128), (v128_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N );
scrypt_core_simd128_4way( (v128_t*)(W+256), (v128_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N );
scrypt_core_simd128_4way( (v128_t*)(W+256+128), (v128_t*)V, N );
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 );
@@ -1138,9 +1134,9 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
// SSE2 working
intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 );
scrypt_core_4way( (__m128i*) W, (__m128i*)V, N );
scrypt_core_4way( (v128_t*) W, (v128_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N );
scrypt_core_4way( (v128_t*)(W+128), (v128_t*)V, N );
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
*/
@@ -1339,7 +1335,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
for ( int i = 0; i < 8; i++ )
casti_m128i( tstate, i ) = _mm_set1_epi32( midstate[i] );
casti_v128( tstate, i ) = v128_32( midstate[i] );
HMAC_SHA256_80_init_4way(W, tstate, ostate);
PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
@@ -1354,7 +1350,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
}
else
scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N );
scrypt_core_4way( (v128_t*)W, (v128_t*)scratchbuf, N );
@@ -1364,7 +1360,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
// working, simple 4 way parallel, best for scrypt
// scrypt_core_4way( (__m128i*)W, (__m128i*)V, N );
// scrypt_core_4way( (v128_t*)W, (v128_t*)V, N );
/*
// Working Linear single threaded SIMD

View File

@@ -31,6 +31,7 @@
#include "hmac-sha256-hash-4way.h"
#include "compat.h"
#if defined(__SSE2__)
// HMAC 4-way SSE2
/**
@@ -169,6 +170,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
}
}
#endif
#if defined(__AVX2__)
// HMAC 8-way AVX2

View File

@@ -38,6 +38,7 @@
#include "simd-utils.h"
#include "sha256-hash.h"
#if defined(__SSE2__)
typedef struct _hmac_sha256_4way_context
{
sha256_4way_context ictx;
@@ -60,6 +61,8 @@ void hmac_sha256_4way_full( void*, const void *, size_t Klen, const void *,
void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t,
const uint8_t *, size_t, uint64_t );
#endif
#if defined(__AVX2__)
typedef struct _hmac_sha256_8way_context
@@ -78,7 +81,9 @@ void hmac_sha256_8way_full( void*, const void *, size_t Klen, const void *,
void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
const uint8_t *, size_t, uint64_t );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct _hmac_sha256_16way_context
@@ -100,8 +105,6 @@ void pbkdf2_sha256_16way( uint8_t *, size_t, const uint8_t *, size_t,
const uint8_t *, size_t, uint64_t );
#endif // AVX512
#endif // AVX2
#endif // HMAC_SHA256_4WAY_H__

View File

@@ -666,6 +666,9 @@ bool register_sha256d_algo( algo_gate_t* gate )
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
//#elif defined(SHA256D_8WAY)
// gate->scanhash = (void*)&scanhash_sha256d_8way;
#else

View File

@@ -1,6 +1,3 @@
#if defined(__SSE2__)
#include <stddef.h>
#include <string.h>
#include "sha256-hash.h"
@@ -36,30 +33,29 @@ static const uint32_t K256[64] =
// SHA-256 4 way SSE2
#define CHs(X, Y, Z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
v128_xor( v128_and( v128_xor( Y, Z ), X ), Z )
#define MAJs(X, Y, Z) \
_mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \
Y_xor_Z ) )
v128_xor( Y, v128_and( X_xor_Y = v128_xor( X, Y ), Y_xor_Z ) )
#define BSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 2), mm128_ror_32(x, 13) ), mm128_ror_32( x, 22) )
v128_xor( v128_xor( \
v128_ror32(x, 2), v128_ror32(x, 13) ), v128_ror32( x, 22) )
#define BSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 6), mm128_ror_32(x, 11) ), mm128_ror_32( x, 25) )
v128_xor( v128_xor( \
v128_ror32(x, 6), v128_ror32(x, 11) ), v128_ror32( x, 25) )
#define SSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 7), mm128_ror_32(x, 18) ), _mm_srli_epi32(x, 3) )
v128_xor( v128_xor( \
v128_ror32(x, 7), v128_ror32(x, 18) ), v128_sr32(x, 3) )
#define SSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
v128_xor( v128_xor( \
v128_ror32(x, 17), v128_ror32(x, 19) ), v128_sr32(x, 10) )
#define SHA2s_MEXP( a, b, c, d ) \
mm128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d );
v128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d );
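CHs, MAJs, BSG2_* and SSG2_* above are the FIPS 180-4 Ch, Maj, Σ0, Σ1, σ0 and σ1 functions in 4-lane form; MAJs uses the identity Maj(x,y,z) = y ^ ((x^y) & (y^z)) so the Y_xor_Z term can be carried from one round to the next. Scalar equivalents for reference (a sketch, not code from this tree):

#include <stdint.h>

static inline uint32_t rotr32_sk( uint32_t x, unsigned c )
{  return ( x >> c ) | ( x << ( 32 - c ) );  }

static inline uint32_t ch_sk ( uint32_t x, uint32_t y, uint32_t z )
{  return ( ( y ^ z ) & x ) ^ z;  }                  // == (x&y) ^ (~x&z)

static inline uint32_t maj_sk( uint32_t x, uint32_t y, uint32_t z )
{  return y ^ ( ( x ^ y ) & ( y ^ z ) );  }          // == (x&y)^(x&z)^(y&z)

static inline uint32_t bsg0_sk( uint32_t x )         // Σ0
{  return rotr32_sk( x, 2 ) ^ rotr32_sk( x, 13 ) ^ rotr32_sk( x, 22 );  }

static inline uint32_t bsg1_sk( uint32_t x )         // Σ1
{  return rotr32_sk( x, 6 ) ^ rotr32_sk( x, 11 ) ^ rotr32_sk( x, 25 );  }

static inline uint32_t ssg0_sk( uint32_t x )         // σ0
{  return rotr32_sk( x, 7 ) ^ rotr32_sk( x, 18 ) ^ ( x >> 3 );  }

static inline uint32_t ssg1_sk( uint32_t x )         // σ1
{  return rotr32_sk( x, 17 ) ^ rotr32_sk( x, 19 ) ^ ( x >> 10 );  }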
#define SHA256x4_MSG_EXPANSION( W ) \
W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \
@@ -81,19 +77,19 @@ static const uint32_t K256[64] =
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i T1, T2; \
__m128i K = v128_32( K256[( (j)+(i) )] ); \
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
v128_t T1, T2; \
v128_t K = v128_32( K256[( (j)+(i) )] ); \
T1 = v128_add32( H, v128_add4_32( BSG2_1(E), CHs(E, F, G), \
K, W[i] ) ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
T2 = v128_add32( BSG2_0(A), MAJs(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
D = v128_add32( D, T1 ); \
H = v128_add32( T1, T2 ); \
} while (0)
#define SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
{ \
__m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); \
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C ); \
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \
@@ -113,10 +109,10 @@ do { \
}
// LE data, no need to byte swap
static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W,
const __m128i *in )
static inline void SHA256_4WAY_TRANSFORM( v128_t *out, v128_t *W,
const v128_t *in )
{
__m128i A, B, C, D, E, F, G, H;
v128_t A, B, C, D, E, F, G, H;
A = in[0];
B = in[1];
@@ -135,109 +131,102 @@ static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W,
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
out[0] = _mm_add_epi32( in[0], A );
out[1] = _mm_add_epi32( in[1], B );
out[2] = _mm_add_epi32( in[2], C );
out[3] = _mm_add_epi32( in[3], D );
out[4] = _mm_add_epi32( in[4], E );
out[5] = _mm_add_epi32( in[5], F );
out[6] = _mm_add_epi32( in[6], G );
out[7] = _mm_add_epi32( in[7], H );
out[0] = v128_add32( in[0], A );
out[1] = v128_add32( in[1], B );
out[2] = v128_add32( in[2], C );
out[3] = v128_add32( in[3], D );
out[4] = v128_add32( in[4], E );
out[5] = v128_add32( in[5], F );
out[6] = v128_add32( in[6], G );
out[7] = v128_add32( in[7], H );
}
// LE data, no need to byte swap
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in )
void sha256_4way_transform_le( v128_t *state_out, const v128_t *data,
const v128_t *state_in )
{
__m128i W[16];
memcpy_128( W, data, 16 );
v128_t W[16];
v128_memcpy( W, data, 16 );
SHA256_4WAY_TRANSFORM( state_out, W, state_in );
}
// BE data, need to byte swap input data
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in )
void sha256_4way_transform_be( v128_t *state_out, const v128_t *data,
const v128_t *state_in )
{
__m128i W[16];
mm128_block_bswap_32( W, data );
mm128_block_bswap_32( W+8, data+8 );
v128_t W[16];
v128_block_bswap32( W, data );
v128_block_bswap32( W+8, data+8 );
SHA256_4WAY_TRANSFORM( state_out, W, state_in );
}
// prehash_3rounds & final_rounds are not working
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in )
void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X,
const v128_t *W, const v128_t *state_in )
{
__m128i A, B, C, D, E, F, G, H;
v128_t A, B, C, D, E, F, G, H;
// precalculate constant part msg expansion for second iteration.
X[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] );
X[ 2] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 1] ), W[12] ),
SSG2_0( W[ 4] ) );
X[ 4] = _mm_add_epi32( _mm_add_epi32( W[13], SSG2_0( W[ 5] ) ),
W[ 4] );
X[ 5] = _mm_add_epi32( _mm_add_epi32( W[14], SSG2_0( W[ 6] ) ),
W[ 5] );
X [6] = _mm_add_epi32( _mm_add_epi32( W[15], SSG2_0( W[ 7] ) ),
W[ 6] );
X[ 7] = _mm_add_epi32( _mm_add_epi32( X[ 0], SSG2_0( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm_add_epi32( _mm_add_epi32( X[ 1], SSG2_0( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm_add_epi32( SSG2_0( W[10] ), W[ 9] );
X[10] = _mm_add_epi32( SSG2_0( W[11] ), W[10] );
X[11] = _mm_add_epi32( SSG2_0( W[12] ), W[11] );
X[12] = _mm_add_epi32( SSG2_0( W[13] ), W[12] );
X[13] = _mm_add_epi32( SSG2_0( W[14] ), W[13] );
X[14] = _mm_add_epi32( SSG2_0( W[15] ), W[14] );
X[15] = _mm_add_epi32( SSG2_0( X[ 0] ), W[15] );
X[ 2] = v128_add32( v128_add32( SSG2_1( X[ 0] ), W[11] ), W[ 2] );
X[ 3] = v128_add32( v128_add32( SSG2_1( X[ 1] ), W[12] ), SSG2_0( W[ 4] ) );
X[ 4] = v128_add32( v128_add32( W[13], SSG2_0( W[ 5] ) ), W[ 4] );
X[ 5] = v128_add32( v128_add32( W[14], SSG2_0( W[ 6] ) ), W[ 5] );
X [6] = v128_add32( v128_add32( W[15], SSG2_0( W[ 7] ) ), W[ 6] );
X[ 7] = v128_add32( v128_add32( X[ 0], SSG2_0( W[ 8] ) ), W[ 7] );
X[ 8] = v128_add32( v128_add32( X[ 1], SSG2_0( W[ 9] ) ), W[ 8] );
X[ 9] = v128_add32( SSG2_0( W[10] ), W[ 9] );
X[10] = v128_add32( SSG2_0( W[11] ), W[10] );
X[11] = v128_add32( SSG2_0( W[12] ), W[11] );
X[12] = v128_add32( SSG2_0( W[13] ), W[12] );
X[13] = v128_add32( SSG2_0( W[14] ), W[13] );
X[14] = v128_add32( SSG2_0( W[15] ), W[14] );
X[15] = v128_add32( SSG2_0( X[ 0] ), W[15] );
A = _mm_load_si128( state_in );
B = _mm_load_si128( state_in + 1 );
C = _mm_load_si128( state_in + 2 );
D = _mm_load_si128( state_in + 3 );
E = _mm_load_si128( state_in + 4 );
F = _mm_load_si128( state_in + 5 );
G = _mm_load_si128( state_in + 6 );
H = _mm_load_si128( state_in + 7 );
A = v128_load( state_in );
B = v128_load( state_in + 1 );
C = v128_load( state_in + 2 );
D = v128_load( state_in + 3 );
E = v128_load( state_in + 4 );
F = v128_load( state_in + 5 );
G = v128_load( state_in + 6 );
H = v128_load( state_in + 7 );
__m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C );
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
_mm_store_si128( state_mid , A );
_mm_store_si128( state_mid + 1, B );
_mm_store_si128( state_mid + 2, C );
_mm_store_si128( state_mid + 3, D );
_mm_store_si128( state_mid + 4, E );
_mm_store_si128( state_mid + 5, F );
_mm_store_si128( state_mid + 6, G );
_mm_store_si128( state_mid + 7, H );
v128_store( state_mid , A );
v128_store( state_mid + 1, B );
v128_store( state_mid + 2, C );
v128_store( state_mid + 3, D );
v128_store( state_mid + 4, E );
v128_store( state_mid + 5, F );
v128_store( state_mid + 6, G );
v128_store( state_mid + 7, H );
}
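sha256_4way_prehash_3rounds above hoists everything that does not depend on the nonce out of the per-nonce loop: in the final 16-word block the nonce occupies word 3, so rounds 0-2 and the nonce-free terms of the message expansion (the X[] array) can be computed once per work template. The intended pairing with sha256_4way_final_rounds would look roughly like the sketch below; note the comment above says the pair is not currently working, and the v128_set_32 lane order is an assumption.

// Hypothetical per-template / per-nonce split (sketch only).
void scan_sketch( v128_t W[16], const v128_t state_in[8],
                  uint32_t first_nonce, uint32_t last_nonce )
{
   v128_t mid[8], X[16], hash[8];

   // Once per work template: 3 rounds plus nonce-free message expansion.
   sha256_4way_prehash_3rounds( mid, X, W, state_in );

   for ( uint32_t n = first_nonce; n < last_nonce; n += 4 )
   {
      W[3] = v128_set_32( n+3, n+2, n+1, n );   // only word 3 changes
      sha256_4way_final_rounds( hash, W, state_in, mid, X );
      // ... test the four hash lanes against the target ...
   }
}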
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X )
void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const v128_t *state_mid, const v128_t *X )
{
__m128i A, B, C, D, E, F, G, H;
__m128i W[16];
v128_t A, B, C, D, E, F, G, H;
v128_t W[16];
memcpy_128( W, data, 16 );
v128_memcpy( W, data, 16 );
A = _mm_load_si128( state_mid );
B = _mm_load_si128( state_mid + 1 );
C = _mm_load_si128( state_mid + 2 );
D = _mm_load_si128( state_mid + 3 );
E = _mm_load_si128( state_mid + 4 );
F = _mm_load_si128( state_mid + 5 );
G = _mm_load_si128( state_mid + 6 );
H = _mm_load_si128( state_mid + 7 );
A = v128_load( state_mid );
B = v128_load( state_mid + 1 );
C = v128_load( state_mid + 2 );
D = v128_load( state_mid + 3 );
E = v128_load( state_mid + 4 );
F = v128_load( state_mid + 5 );
G = v128_load( state_mid + 6 );
H = v128_load( state_mid + 7 );
__m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( G, H );
v128_t X_xor_Y, Y_xor_Z = v128_xor( G, H );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
@@ -256,27 +245,20 @@ void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
// update precalculated msg expansion with new nonce: W[3].
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = _mm_add_epi32( X[ 2], SSG2_0( W[ 3] ) );
W[ 3] = _mm_add_epi32( X[ 3], W[ 3] );
W[ 4] = _mm_add_epi32( X[ 4], SSG2_1( W[ 2] ) );
W[ 5] = _mm_add_epi32( X[ 5], SSG2_1( W[ 3] ) );
W[ 6] = _mm_add_epi32( X[ 6], SSG2_1( W[ 4] ) );
W[ 7] = _mm_add_epi32( X[ 7], SSG2_1( W[ 5] ) );
W[ 8] = _mm_add_epi32( X[ 8], SSG2_1( W[ 6] ) );
W[ 9] = _mm_add_epi32( X[ 9], _mm_add_epi32( SSG2_1( W[ 7] ),
W[ 2] ) );
W[10] = _mm_add_epi32( X[10], _mm_add_epi32( SSG2_1( W[ 8] ),
W[ 3] ) );
W[11] = _mm_add_epi32( X[11], _mm_add_epi32( SSG2_1( W[ 9] ),
W[ 4] ) );
W[12] = _mm_add_epi32( X[12], _mm_add_epi32( SSG2_1( W[10] ),
W[ 5] ) );
W[13] = _mm_add_epi32( X[13], _mm_add_epi32( SSG2_1( W[11] ),
W[ 6] ) );
W[14] = _mm_add_epi32( X[14], _mm_add_epi32( SSG2_1( W[12] ),
W[ 7] ) );
W[15] = _mm_add_epi32( X[15], _mm_add_epi32( SSG2_1( W[13] ),
W[ 8] ) );
W[ 2] = v128_add32( X[ 2], SSG2_0( W[ 3] ) );
W[ 3] = v128_add32( X[ 3], W[ 3] );
W[ 4] = v128_add32( X[ 4], SSG2_1( W[ 2] ) );
W[ 5] = v128_add32( X[ 5], SSG2_1( W[ 3] ) );
W[ 6] = v128_add32( X[ 6], SSG2_1( W[ 4] ) );
W[ 7] = v128_add32( X[ 7], SSG2_1( W[ 5] ) );
W[ 8] = v128_add32( X[ 8], SSG2_1( W[ 6] ) );
W[ 9] = v128_add32( X[ 9], v128_add32( SSG2_1( W[ 7] ), W[ 2] ) );
W[10] = v128_add32( X[10], v128_add32( SSG2_1( W[ 8] ), W[ 3] ) );
W[11] = v128_add32( X[11], v128_add32( SSG2_1( W[ 9] ), W[ 4] ) );
W[12] = v128_add32( X[12], v128_add32( SSG2_1( W[10] ), W[ 5] ) );
W[13] = v128_add32( X[13], v128_add32( SSG2_1( W[11] ), W[ 6] ) );
W[14] = v128_add32( X[14], v128_add32( SSG2_1( W[12] ), W[ 7] ) );
W[15] = v128_add32( X[15], v128_add32( SSG2_1( W[13] ), W[ 8] ) );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x4_MSG_EXPANSION( W );
@@ -284,45 +266,47 @@ void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
A = _mm_add_epi32( A, _mm_load_si128( state_in ) );
B = _mm_add_epi32( B, _mm_load_si128( state_in + 1 ) );
C = _mm_add_epi32( C, _mm_load_si128( state_in + 2 ) );
D = _mm_add_epi32( D, _mm_load_si128( state_in + 3 ) );
E = _mm_add_epi32( E, _mm_load_si128( state_in + 4 ) );
F = _mm_add_epi32( F, _mm_load_si128( state_in + 5 ) );
G = _mm_add_epi32( G, _mm_load_si128( state_in + 6 ) );
H = _mm_add_epi32( H, _mm_load_si128( state_in + 7 ) );
A = v128_add32( A, v128_load( state_in ) );
B = v128_add32( B, v128_load( state_in + 1 ) );
C = v128_add32( C, v128_load( state_in + 2 ) );
D = v128_add32( D, v128_load( state_in + 3 ) );
E = v128_add32( E, v128_load( state_in + 4 ) );
F = v128_add32( F, v128_load( state_in + 5 ) );
G = v128_add32( G, v128_load( state_in + 6 ) );
H = v128_add32( H, v128_load( state_in + 7 ) );
_mm_store_si128( state_out , A );
_mm_store_si128( state_out + 1, B );
_mm_store_si128( state_out + 2, C );
_mm_store_si128( state_out + 3, D );
_mm_store_si128( state_out + 4, E );
_mm_store_si128( state_out + 5, F );
_mm_store_si128( state_out + 6, G );
_mm_store_si128( state_out + 7, H );
v128_store( state_out , A );
v128_store( state_out + 1, B );
v128_store( state_out + 2, C );
v128_store( state_out + 3, D );
v128_store( state_out + 4, E );
v128_store( state_out + 5, F );
v128_store( state_out + 6, G );
v128_store( state_out + 7, H );
}
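/*
 * Usage sketch (illustrative only; variable names are not from this file):
 * do the nonce-independent work once per block template, then redo only the
 * final rounds for each batch of four nonces.
 *
 *    v128_t mid[8], X[16], hash[8];
 *    sha256_4way_prehash_3rounds( mid, X, wdata, istate ); // wdata[3] ignored here
 *    for ( uint32_t n = first_nonce; n < last_nonce; n += 4 )
 *    {
 *       wdata[3] = v128_set32( n+3, n+2, n+1, n );         // one nonce per lane
 *       sha256_4way_final_rounds( hash, wdata, istate, mid, X );
 *       // check the hash lanes against the target ...
 *    }
 */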
# if 0
// Working correctly but still slower
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const uint32_t *target )
int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target )
{
__m128i A, B, C, D, E, F, G, H, T0, T1, T2;
__m128i vmask, targ, hash;
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
__m128i W[16]; memcpy_128( W, data, 16 );
v128_t W[16]; v128_memcpy( W, data, 16 );
A = _mm_load_si128( state_in );
B = _mm_load_si128( state_in+1 );
C = _mm_load_si128( state_in+2 );
D = _mm_load_si128( state_in+3 );
E = _mm_load_si128( state_in+4 );
F = _mm_load_si128( state_in+5 );
G = _mm_load_si128( state_in+6 );
H = _mm_load_si128( state_in+7 );
A = v128_load( state_in );
B = v128_load( state_in+1 );
C = v128_load( state_in+2 );
D = v128_load( state_in+3 );
E = v128_load( state_in+4 );
F = v128_load( state_in+5 );
G = v128_load( state_in+6 );
H = v128_load( state_in+7 );
const __m128i IV7 = H;
const __m128i IV6 = G;
const v128_t IV7 = H;
const v128_t IV6 = G;
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
SHA256x4_MSG_EXPANSION( W );
@@ -344,7 +328,7 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] );
__m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C );
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 );
@@ -357,65 +341,64 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 );
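// Rounds 58..63 are unrolled by hand so that hash word 7 (known after round
// 60) and hash word 6 (known after round 61, part 1) can be compared against
// the target, letting most non-candidates exit before rounds 62 and 63.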
T0 = _mm_add_epi32( v128_32( K256[58] ),
mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = _mm_add_epi32( B, T0 );
T0 = v128_add32( v128_32( K256[58] ),
v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = v128_add32( B, T0 );
T1 = _mm_add_epi32( v128_32( K256[59] ),
mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = _mm_add_epi32( A, T1 );
T1 = v128_add32( v128_32( K256[59] ),
v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = v128_add32( A, T1 );
T2 = _mm_add_epi32( v128_32( K256[60] ),
mm128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = _mm_add_epi32( H, T2 );
T2 = v128_add32( v128_32( K256[60] ),
v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = v128_add32( H, T2 );
targ = v128_32( target[7] );
hash = mm128_bswap_32( _mm_add_epi32( H, IV7 ) );
hash = v128_bswap32( v128_add32( H, IV7 ) );
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash );
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( likely( 0xf == ( flip ^
mm128_movmask_32( _mm_cmpgt_epi32( hash, targ ) ) ) ))
if ( likely(
0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) ))
return 0;
t6_mask = mm128_movmask_32( vmask =_mm_cmpeq_epi32( hash, targ ) );
t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) );
// round 58 part 2
F = _mm_add_epi32( T0, _mm_add_epi32( BSG2_0( G ), MAJs( G, H, A ) ) );
F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) );
// round 61 part 1
W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm_add_epi32( v128_32( K256[61] ),
mm128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = _mm_add_epi32( G, T0 );
T0 = v128_add32( v128_32( K256[61] ),
v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = v128_add32( G, T0 );
if ( t6_mask )
{
targ = _mm_and_si128( vmask, v128_32( target[6] ) );
hash = mm128_bswap_32( _mm_add_epi32( G, IV6 ) );
targ = v128_and( vmask, v128_32( target[6] ) );
hash = v128_bswap32( v128_add32( G, IV6 ) );
if ( ( 0 != ( t6_mask & mm128_movmask_32(
_mm_cmpeq_epi32( hash, targ ) ) ) ))
if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) ))
return 0;
else
{
flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash );
if ( 0 != ( t6_mask & ( flip ^ mm128_movmask_32(
_mm_cmpgt_epi32( hash, targ ) ) ) ) )
flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( 0 != ( t6_mask & ( flip ^ v128_movmask32(
v128_cmpgt32( hash, targ ) ) ) ) )
return 0;
else if ( target[6] == 0x80000000 )
{
if ( 0 == ( t6_mask & mm128_movmask_32(
_mm_cmpgt_epi32( hash, _mm_xor_si128( hash, hash ) ) ) ) )
if ( 0 == ( t6_mask & v128_movmask32(
v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) )
return 0;
}
}
}
// rounds 59 to 61 part 2
E = _mm_add_epi32( T1, _mm_add_epi32( BSG2_0( F ), MAJs( F, G, H ) ) );
D = _mm_add_epi32( T2, _mm_add_epi32( BSG2_0( E ), MAJs( E, F, G ) ) );
C = _mm_add_epi32( T0, _mm_add_epi32( BSG2_0( D ), MAJs( D, E, F ) ) );
E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) );
D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) );
C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) );
// rounds 62 & 63
W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] );
@@ -424,17 +407,18 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 );
state_out[0] = _mm_add_epi32( state_in[0], A );
state_out[1] = _mm_add_epi32( state_in[1], B );
state_out[2] = _mm_add_epi32( state_in[2], C );
state_out[3] = _mm_add_epi32( state_in[3], D );
state_out[4] = _mm_add_epi32( state_in[4], E );
state_out[5] = _mm_add_epi32( state_in[5], F );
state_out[6] = _mm_add_epi32( state_in[6], G );
state_out[7] = _mm_add_epi32( state_in[7], H );
state_out[0] = v128_add32( state_in[0], A );
state_out[1] = v128_add32( state_in[1], B );
state_out[2] = v128_add32( state_in[2], C );
state_out[3] = v128_add32( state_in[3], D );
state_out[4] = v128_add32( state_in[4], E );
state_out[5] = v128_add32( state_in[5], F );
state_out[6] = v128_add32( state_in[6], G );
state_out[7] = v128_add32( state_in[7], H );
return 1;
}
#endif
void sha256_4way_init( sha256_4way_context *sc )
{
@@ -451,7 +435,7 @@ void sha256_4way_init( sha256_4way_context *sc )
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
v128_t *vdata = (v128_t*)data;
size_t ptr;
const int buf_size = 64;
@@ -464,7 +448,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 );
v128_memcpy( sc->buf + (ptr>>2), vdata, clen>>2 );
vdata = vdata + (clen>>2);
ptr += clen;
len -= clen;
@@ -494,12 +478,12 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
v128_memset_zero( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
memset_zero_128( sc->buf, pad >> 2 );
v128_memset_zero( sc->buf, pad >> 2 );
}
else
memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
v128_memset_zero( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
@@ -509,7 +493,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
v128_block_bswap32( dst, sc->val );
}
void sha256_4way_full( void *dst, const void *data, size_t len )
@@ -1725,4 +1709,3 @@ void sha256_16way_full( void *dst, const void *data, size_t len )
#endif // AVX512
#endif // __AVX2__
#endif // __SSE2__

File diff suppressed because it is too large

View File

@@ -25,7 +25,7 @@ void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if defined(__SHA__)
#if defined(__x86_64__) && defined(__SHA__)
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
@@ -33,34 +33,67 @@ void sha256_opt_transform_le( uint32_t *state_out, const void *input,
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
// 2 way with interleaved instructions
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
// 2 way serial with interleaved instructions
void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate );
void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
void sha256_ni2x_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Select target
// with SHA...
#define sha256_transform_le sha256_opt_transform_le
#define sha256_transform_be sha256_opt_transform_be
#define sha256_transform_le sha256_opt_transform_le
#define sha256_transform_be sha256_opt_transform_be
#define sha256_2x_transform_le sha256_ni2x_transform_le
#define sha256_2x_transform_be sha256_ni2x_transform_be
#define sha256_prehash_3rounds sha256_ni_prehash_3rounds
#define sha256_2x_final_rounds sha256_ni2x_final_rounds
#elif defined(__aarch64__) && defined(__ARM_NEON)
void sha256_neon_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha256_neon_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha256_neon2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_neon2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_neon_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate );
void sha256_neon2x_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
#define sha256_transform_le sha256_neon_transform_le
#define sha256_transform_be sha256_neon_transform_be
#define sha256_2x_transform_le sha256_neon2x_transform_le
#define sha256_2x_transform_be sha256_neon2x_transform_be
#define sha256_prehash_3rounds sha256_neon_prehash_3rounds
#define sha256_2x_final_rounds sha256_neon2x_final_rounds
#else
// without SHA...
// without HW acceleration...
#include "sph_sha2.h"
#define sha256_transform_le sph_sha256_transform_le
#define sha256_transform_be sph_sha256_transform_be
#define sha256_transform_le sph_sha256_transform_le
#define sha256_transform_be sph_sha256_transform_be
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif
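// The generic sha256_transform_le/_be names above always resolve to something:
// SHA-NI on x86_64, the NEON/SHA2 path on AArch64, or the portable sph code
// otherwise. The 2x and final_rounds variants are mapped only when hardware
// SHA is available. Caller sketch (buffer names are illustrative):
//
//    uint32_t midstate[8], hash_a[8], hash_b[8];
//    sha256_transform_le( midstate, first_block, sha256_iv );
//    sha256_2x_transform_le( hash_a, hash_b, block_a, block_b,
//                            midstate, midstate );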
@@ -122,14 +155,12 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
#endif // AVX2
#if defined(__SSE2__)
// SHA-256 4 way
typedef struct
{
__m128i buf[64>>2];
__m128i val[8];
v128_t buf[64>>2];
v128_t val[8];
uint32_t count_high, count_low;
} sha256_4way_context __attribute__ ((aligned (32)));
@@ -138,17 +169,16 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in );
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const uint32_t *target );
void sha256_4way_transform_le( v128_t *state_out, const v128_t *data,
const v128_t *state_in );
void sha256_4way_transform_be( v128_t *state_out, const v128_t *data,
const v128_t *state_in );
void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X,
const v128_t *W, const v128_t *state_in );
void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const v128_t *state_mid, const v128_t *X );
int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target );
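// The 4-way routines operate on four independent messages whose 32-bit words
// are interleaved lane by lane (word i of lane j at 32-bit index i*4 + j), so
// one call hashes four candidates at once. Interleaving and de-interleaving of
// the input and output buffers is done by the caller.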
#endif // SSE2
#endif

View File

@@ -32,11 +32,11 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
const v128_t shuf_bswap32 =
v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
sha256_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
@@ -48,7 +48,7 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 80*8; // bit count
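// Second block layout: words 0..2 are header words 16..18, word 3 is the nonce
// (rewritten every iteration), word 4 is the 0x80 terminator, words 5..14 are
// zero, and word 15 holds the 640-bit (80 byte) message length.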
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
@@ -61,18 +61,18 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
sha256_2x_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_2x_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
casti_v128( hasha, 0 ) =
_mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 );
casti_v128( hasha, 1 ) =
_mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
@@ -81,10 +81,94 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
casti_v128( hashb, 0 ) =
_mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 );
casti_v128( hashb, 1 ) =
_mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256D_NEON_SHA2)
int scanhash_sha256d_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t shuf_bswap32 =
v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 80*8; // bit count
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 32*8; // bit count
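// Third block: the second SHA-256 pass hashes the 32-byte digest of the first
// pass, so the 0x80 terminator sits in word 8 and the length field is 256 bits.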
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_neon2x_transform_le( block2a, block2b, block1a, block1b,
mstatea, mstatea );
sha256_neon2x_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
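// Cheap pre-check: compare only the most significant hash word against the
// target; the full byte swap and target test run only for likely candidates.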
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
@@ -282,11 +366,11 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i istate[8] __attribute__ ((aligned (32)));
__m128i mstate[8] __attribute__ ((aligned (32)));
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t istate[8] __attribute__ ((aligned (32)));
v128_t mstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
@@ -295,23 +379,23 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
v128_t *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
*noncev = v128_set_32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
v128_memset_zero( vdata+16 + 5, 10 );
vdata[16+15] = v128_32( 80*8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
v128_memset_zero( block + 9, 6 );
block[15] = v128_32( 32*8 );
// initialize state
@@ -332,7 +416,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
sha256_4way_transform_le( block, vdata+16, mstate );
sha256_4way_transform_le( hash32, block, istate );
mm128_block_bswap_32( hash32, hash32 );
v128_block_bswap32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -344,7 +428,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
*noncev = v128_add32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;

View File

@@ -8,6 +8,8 @@
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define SHA256D_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
@@ -41,5 +43,12 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
#endif
#if defined(SHA256D_NEON_SHA2)
int scanhash_sha256d_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -9,6 +9,8 @@
#define SHA256DT_16WAY 1
#elif defined(__SHA__)
#define SHA256DT_SHA 1
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define SHA256DT_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256DT_8WAY 1
#else
@@ -42,11 +44,11 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256dt_iv );
sha256_transform_le( mstatea, pdata, sha256dt_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
@@ -57,7 +59,7 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count
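// The nonstandard length words (0x480 here, 0x300 in the final block) and the
// sha256dt_iv initial state are part of the sha256dt definition rather than
// ordinary SHA-256 padding.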
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
@@ -70,18 +72,16 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
sha256_2x_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_2x_transform_le( hasha, hashb, block2a, block2b,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
@@ -90,10 +90,92 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_NEON_SHA2)
#pragma message "SHA256DT NEON SHA"
int scanhash_sha256dt_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_neon_transform_le( mstatea, pdata, sha256dt_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 0x300; // bit count
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_neon2x_transform_le( block2a, block2b, block1a, block1b,
mstatea, mstatea );
sha256_neon2x_transform_le( hasha, hashb, block2a, block2b,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
@@ -132,7 +214,7 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
@@ -227,7 +309,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
@@ -291,11 +373,11 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t initstate[8] __attribute__ ((aligned (32)));
v128_t midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
@@ -304,23 +386,23 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
v128_t *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
*noncev = v128_set32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
v128_memset_zero( vdata+16 + 5, 10 );
vdata[16+15] = v128_32( 0x480 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
v128_memset_zero( block + 9, 6 );
block[15] = v128_32( 0x300 );
// initialize state
@@ -341,7 +423,7 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( hash32, block, initstate );
mm128_block_bswap_32( hash32, hash32 );
v128_block_bswap32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -353,7 +435,7 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
*noncev = v128_add32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
@@ -371,11 +453,16 @@ bool register_sha256dt_algo( algo_gate_t* gate )
#elif defined(SHA256DT_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_sha;
#elif defined(SHA256DT_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_neon_sha2;
#elif defined(SHA256DT_8WAY)
gate->scanhash = (void*)&scanhash_sha256dt_8way;
#else
#elif defined(SHA256DT_4WAY)
gate->scanhash = (void*)&scanhash_sha256dt_4way;
#endif
return true;
}

View File

@@ -188,7 +188,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
v128_t *noncev = (v128_t*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const uint64_t htmax[] = { 0,
@@ -204,7 +204,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
0xFFFF0000,
0 };
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
@@ -212,7 +212,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
{
uint32_t mask = masks[m];
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
*noncev = v128_bswap32( v128_set32( n+3,n+2,n+1,n ) );
pdata[19] = n;
sha256q_4way_hash( hash, vdata );

View File

@@ -45,7 +45,7 @@ int scanhash_sha256q( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
sha256q_midstate( edata );
do

View File

@@ -131,11 +131,11 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// const v128_t shuf_bswap32 =
// v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
sha256_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
@@ -147,7 +147,7 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
@@ -160,19 +160,17 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
sha256_2x_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( block2a, block2b, block2a, block2b,
sha256_2x_transform_le( block2a, block2b, block2a, block2b,
sha256_iv, sha256_iv );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_2x_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
@@ -181,10 +179,90 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256T_NEON_SHA2)
int scanhash_sha256t_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
// hash first 64 byte block of data
sha256_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 80*8; // bit count
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_neon2x_transform_le( block2a, block2b, block1a, block1b,
mstatea, mstatea );
sha256_neon2x_transform_le( block2a, block2b, block2a, block2b,
sha256_iv, sha256_iv );
sha256_neon2x_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
@@ -295,13 +373,13 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i istate[8] __attribute__ ((aligned (32)));
__m128i mstate[8] __attribute__ ((aligned (32)));
// __m128i mstate2[8] __attribute__ ((aligned (32)));
// __m128i mexp_pre[8] __attribute__ ((aligned (32)));
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t istate[8] __attribute__ ((aligned (32)));
v128_t mstate[8] __attribute__ ((aligned (32)));
// v128_t mstate2[8] __attribute__ ((aligned (32)));
// v128_t mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
@@ -310,23 +388,23 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
v128_t *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
*noncev = v128_set_32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
v128_memset_zero( vdata+16 + 5, 10 );
vdata[16+15] = v128_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
v128_memset_zero( block + 9, 6 );
block[15] = v128_32( 32*8 ); // bit count
// initialize state
@@ -353,10 +431,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
sha256_4way_transform_le( block, block, istate );
sha256_4way_transform_le( hash32, block, istate );
// if ( unlikely( sha256_4way_transform_le_short(
// hash32, block, initstate, ptarget ) ))
// {
mm128_block_bswap_32( hash32, hash32 );
v128_block_bswap32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
@@ -367,8 +442,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
// }
*noncev = _mm_add_epi32( *noncev, four );
*noncev = v128_add32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;

View File

@@ -10,8 +10,11 @@ bool register_sha256t_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sha256t_sha;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;
#else
#elif defined(SHA256T_4WAY)
gate->scanhash = (void*)&scanhash_sha256t_4way;
#else
gate->scanhash = (void*)&scanhash_sha256t;
#endif
return true;
}
@@ -22,16 +25,19 @@ bool register_sha256q_algo( algo_gate_t* gate )
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256q_16way;
gate->hash = (void*)&sha256q_16way_hash;
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q;
gate->hash = (void*)&sha256q_hash;
//#elif defined(SHA256T_SHA)
// gate->optimizations = SHA_OPT;
// gate->scanhash = (void*)&scanhash_sha256q;
// gate->hash = (void*)&sha256q_hash;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256q_8way;
gate->hash = (void*)&sha256q_8way_hash;
#else
#elif defined(SHA256T_4WAY)
gate->scanhash = (void*)&scanhash_sha256q_4way;
gate->hash = (void*)&sha256q_4way_hash;
//#else
// gate->scanhash = (void*)&scanhash_sha256q;
// gate->hash = (void*)&sha256q_4way;
#endif
return true;
}

View File

@@ -8,6 +8,8 @@
#define SHA256T_16WAY 1
#elif defined(__SHA__)
#define SHA256T_SHA 1
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define SHA256T_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256T_8WAY 1
#else
@@ -51,6 +53,17 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
#endif
#if defined(SHA256T_NEON_SHA2)
int scanhash_sha256t_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
int sha256t_hash( void *output, const void *input );
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int sha256q_hash( void *output, const void *input );
int scanhash_sha256q( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -33,7 +33,7 @@
#include <stddef.h>
#include <string.h>
#ifdef __SSE4_1__
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#include "shabal-hash-4way.h"
#ifdef __cplusplus
@@ -1245,16 +1245,16 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // AVX2
#define DECL_STATE \
__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
v128_t A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
v128_t B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
v128_t C0, C1, C2, C3, C4, C5, C6, C7, \
C8, C9, CA, CB, CC, CD, CE, CF; \
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
v128_t M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m128i FIVE = v128_32( 5 ); \
const __m128i THREE = v128_32( 3 ); \
const v128_t FIVE = v128_32( 5 ); \
const v128_t THREE = v128_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE(state) do \
@@ -1429,96 +1429,84 @@ do { \
#define INPUT_BLOCK_ADD \
do { \
B0 = _mm_add_epi32( B0, M0 );\
B1 = _mm_add_epi32( B1, M1 );\
B2 = _mm_add_epi32( B2, M2 );\
B3 = _mm_add_epi32( B3, M3 );\
B4 = _mm_add_epi32( B4, M4 );\
B5 = _mm_add_epi32( B5, M5 );\
B6 = _mm_add_epi32( B6, M6 );\
B7 = _mm_add_epi32( B7, M7 );\
B8 = _mm_add_epi32( B8, M8 );\
B9 = _mm_add_epi32( B9, M9 );\
BA = _mm_add_epi32( BA, MA );\
BB = _mm_add_epi32( BB, MB );\
BC = _mm_add_epi32( BC, MC );\
BD = _mm_add_epi32( BD, MD );\
BE = _mm_add_epi32( BE, ME );\
BF = _mm_add_epi32( BF, MF );\
B0 = v128_add32( B0, M0 );\
B1 = v128_add32( B1, M1 );\
B2 = v128_add32( B2, M2 );\
B3 = v128_add32( B3, M3 );\
B4 = v128_add32( B4, M4 );\
B5 = v128_add32( B5, M5 );\
B6 = v128_add32( B6, M6 );\
B7 = v128_add32( B7, M7 );\
B8 = v128_add32( B8, M8 );\
B9 = v128_add32( B9, M9 );\
BA = v128_add32( BA, MA );\
BB = v128_add32( BB, MB );\
BC = v128_add32( BC, MC );\
BD = v128_add32( BD, MD );\
BE = v128_add32( BE, ME );\
BF = v128_add32( BF, MF );\
} while (0)
#define INPUT_BLOCK_SUB \
do { \
C0 = _mm_sub_epi32( C0, M0 ); \
C1 = _mm_sub_epi32( C1, M1 ); \
C2 = _mm_sub_epi32( C2, M2 ); \
C3 = _mm_sub_epi32( C3, M3 ); \
C4 = _mm_sub_epi32( C4, M4 ); \
C5 = _mm_sub_epi32( C5, M5 ); \
C6 = _mm_sub_epi32( C6, M6 ); \
C7 = _mm_sub_epi32( C7, M7 ); \
C8 = _mm_sub_epi32( C8, M8 ); \
C9 = _mm_sub_epi32( C9, M9 ); \
CA = _mm_sub_epi32( CA, MA ); \
CB = _mm_sub_epi32( CB, MB ); \
CC = _mm_sub_epi32( CC, MC ); \
CD = _mm_sub_epi32( CD, MD ); \
CE = _mm_sub_epi32( CE, ME ); \
CF = _mm_sub_epi32( CF, MF ); \
C0 = v128_sub32( C0, M0 ); \
C1 = v128_sub32( C1, M1 ); \
C2 = v128_sub32( C2, M2 ); \
C3 = v128_sub32( C3, M3 ); \
C4 = v128_sub32( C4, M4 ); \
C5 = v128_sub32( C5, M5 ); \
C6 = v128_sub32( C6, M6 ); \
C7 = v128_sub32( C7, M7 ); \
C8 = v128_sub32( C8, M8 ); \
C9 = v128_sub32( C9, M9 ); \
CA = v128_sub32( CA, MA ); \
CB = v128_sub32( CB, MB ); \
CC = v128_sub32( CC, MC ); \
CD = v128_sub32( CD, MD ); \
CE = v128_sub32( CE, ME ); \
CF = v128_sub32( CF, MF ); \
} while (0)
#define XOR_W \
do { \
A0 = _mm_xor_si128( A0, v128_32( Wlow ) ); \
A1 = _mm_xor_si128( A1, v128_32( Whigh ) ); \
A0 = v128_xor( A0, v128_32( Wlow ) ); \
A1 = v128_xor( A1, v128_32( Whigh ) ); \
} while (0)
#define mm128_swap256_128( v1, v2 ) \
v1 = _mm_xor_si128( v1, v2 ); \
v2 = _mm_xor_si128( v1, v2 ); \
v1 = _mm_xor_si128( v1, v2 );
#define v128_swap256_128( v1, v2 ) \
v1 = v128_xor( v1, v2 ); \
v2 = v128_xor( v1, v2 ); \
v1 = v128_xor( v1, v2 );
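// Three-XOR trick: swaps the contents of v1 and v2 without a temporary.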
#define SWAP_BC \
do { \
mm128_swap256_128( B0, C0 ); \
mm128_swap256_128( B1, C1 ); \
mm128_swap256_128( B2, C2 ); \
mm128_swap256_128( B3, C3 ); \
mm128_swap256_128( B4, C4 ); \
mm128_swap256_128( B5, C5 ); \
mm128_swap256_128( B6, C6 ); \
mm128_swap256_128( B7, C7 ); \
mm128_swap256_128( B8, C8 ); \
mm128_swap256_128( B9, C9 ); \
mm128_swap256_128( BA, CA ); \
mm128_swap256_128( BB, CB ); \
mm128_swap256_128( BC, CC ); \
mm128_swap256_128( BD, CD ); \
mm128_swap256_128( BE, CE ); \
mm128_swap256_128( BF, CF ); \
v128_swap256_128( B0, C0 ); \
v128_swap256_128( B1, C1 ); \
v128_swap256_128( B2, C2 ); \
v128_swap256_128( B3, C3 ); \
v128_swap256_128( B4, C4 ); \
v128_swap256_128( B5, C5 ); \
v128_swap256_128( B6, C6 ); \
v128_swap256_128( B7, C7 ); \
v128_swap256_128( B8, C8 ); \
v128_swap256_128( B9, C9 ); \
v128_swap256_128( BA, CA ); \
v128_swap256_128( BB, CB ); \
v128_swap256_128( BC, CC ); \
v128_swap256_128( BD, CD ); \
v128_swap256_128( BE, CE ); \
v128_swap256_128( BF, CF ); \
} while (0)
#define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm128_xor3( xm, xb1, mm128_xorandnot( \
_mm_mullo_epi32( mm128_xor3( xa0, xc, \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
xa0 = v128_xor3( xm, xb1, v128_xorandnot( \
v128_mullo32( v128_xor3( xa0, xc, \
v128_mullo32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
xb0 = mm128_xnor( xa0, mm128_rol_32( xb0, 1 ) ); \
xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \
} while (0)
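// v128_not( v128_xor( a, b ) ) is the XNOR of a and b, equivalent to the
// mm128_xnor() used by the previous SSE2-only version.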
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
_mm_andnot_si128( xb3, xb2 ), \
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \
) ), THREE ) ) ) ); \
xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
} while (0)
*/
#define PERM_STEP_0 do { \
PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \
@@ -1578,61 +1566,61 @@ do { \
#define APPLY_P \
do { \
B0 = mm128_ror_32( B0, 15 ); \
B1 = mm128_ror_32( B1, 15 ); \
B2 = mm128_ror_32( B2, 15 ); \
B3 = mm128_ror_32( B3, 15 ); \
B4 = mm128_ror_32( B4, 15 ); \
B5 = mm128_ror_32( B5, 15 ); \
B6 = mm128_ror_32( B6, 15 ); \
B7 = mm128_ror_32( B7, 15 ); \
B8 = mm128_ror_32( B8, 15 ); \
B9 = mm128_ror_32( B9, 15 ); \
BA = mm128_ror_32( BA, 15 ); \
BB = mm128_ror_32( BB, 15 ); \
BC = mm128_ror_32( BC, 15 ); \
BD = mm128_ror_32( BD, 15 ); \
BE = mm128_ror_32( BE, 15 ); \
BF = mm128_ror_32( BF, 15 ); \
B0 = v128_ror32( B0, 15 ); \
B1 = v128_ror32( B1, 15 ); \
B2 = v128_ror32( B2, 15 ); \
B3 = v128_ror32( B3, 15 ); \
B4 = v128_ror32( B4, 15 ); \
B5 = v128_ror32( B5, 15 ); \
B6 = v128_ror32( B6, 15 ); \
B7 = v128_ror32( B7, 15 ); \
B8 = v128_ror32( B8, 15 ); \
B9 = v128_ror32( B9, 15 ); \
BA = v128_ror32( BA, 15 ); \
BB = v128_ror32( BB, 15 ); \
BC = v128_ror32( BC, 15 ); \
BD = v128_ror32( BD, 15 ); \
BE = v128_ror32( BE, 15 ); \
BF = v128_ror32( BF, 15 ); \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \
AB = _mm_add_epi32( AB, C6 ); \
AA = _mm_add_epi32( AA, C5 ); \
A9 = _mm_add_epi32( A9, C4 ); \
A8 = _mm_add_epi32( A8, C3 ); \
A7 = _mm_add_epi32( A7, C2 ); \
A6 = _mm_add_epi32( A6, C1 ); \
A5 = _mm_add_epi32( A5, C0 ); \
A4 = _mm_add_epi32( A4, CF ); \
A3 = _mm_add_epi32( A3, CE ); \
A2 = _mm_add_epi32( A2, CD ); \
A1 = _mm_add_epi32( A1, CC ); \
A0 = _mm_add_epi32( A0, CB ); \
AB = _mm_add_epi32( AB, CA ); \
AA = _mm_add_epi32( AA, C9 ); \
A9 = _mm_add_epi32( A9, C8 ); \
A8 = _mm_add_epi32( A8, C7 ); \
A7 = _mm_add_epi32( A7, C6 ); \
A6 = _mm_add_epi32( A6, C5 ); \
A5 = _mm_add_epi32( A5, C4 ); \
A4 = _mm_add_epi32( A4, C3 ); \
A3 = _mm_add_epi32( A3, C2 ); \
A2 = _mm_add_epi32( A2, C1 ); \
A1 = _mm_add_epi32( A1, C0 ); \
A0 = _mm_add_epi32( A0, CF ); \
AB = _mm_add_epi32( AB, CE ); \
AA = _mm_add_epi32( AA, CD ); \
A9 = _mm_add_epi32( A9, CC ); \
A8 = _mm_add_epi32( A8, CB ); \
A7 = _mm_add_epi32( A7, CA ); \
A6 = _mm_add_epi32( A6, C9 ); \
A5 = _mm_add_epi32( A5, C8 ); \
A4 = _mm_add_epi32( A4, C7 ); \
A3 = _mm_add_epi32( A3, C6 ); \
A2 = _mm_add_epi32( A2, C5 ); \
A1 = _mm_add_epi32( A1, C4 ); \
A0 = _mm_add_epi32( A0, C3 ); \
AB = v128_add32( AB, C6 ); \
AA = v128_add32( AA, C5 ); \
A9 = v128_add32( A9, C4 ); \
A8 = v128_add32( A8, C3 ); \
A7 = v128_add32( A7, C2 ); \
A6 = v128_add32( A6, C1 ); \
A5 = v128_add32( A5, C0 ); \
A4 = v128_add32( A4, CF ); \
A3 = v128_add32( A3, CE ); \
A2 = v128_add32( A2, CD ); \
A1 = v128_add32( A1, CC ); \
A0 = v128_add32( A0, CB ); \
AB = v128_add32( AB, CA ); \
AA = v128_add32( AA, C9 ); \
A9 = v128_add32( A9, C8 ); \
A8 = v128_add32( A8, C7 ); \
A7 = v128_add32( A7, C6 ); \
A6 = v128_add32( A6, C5 ); \
A5 = v128_add32( A5, C4 ); \
A4 = v128_add32( A4, C3 ); \
A3 = v128_add32( A3, C2 ); \
A2 = v128_add32( A2, C1 ); \
A1 = v128_add32( A1, C0 ); \
A0 = v128_add32( A0, CF ); \
AB = v128_add32( AB, CE ); \
AA = v128_add32( AA, CD ); \
A9 = v128_add32( A9, CC ); \
A8 = v128_add32( A8, CB ); \
A7 = v128_add32( A7, CA ); \
A6 = v128_add32( A6, C9 ); \
A5 = v128_add32( A5, C8 ); \
A4 = v128_add32( A4, C7 ); \
A3 = v128_add32( A3, C6 ); \
A2 = v128_add32( A2, C5 ); \
A1 = v128_add32( A1, C4 ); \
A0 = v128_add32( A0, C3 ); \
} while (0)
#define INCR_W do { \
@@ -1798,8 +1786,8 @@ static void
shabal_4way_core( void *cc, const unsigned char *data, size_t len )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
__m128i *buf;
__m128i *vdata = (__m128i*)data;
v128_t *buf;
v128_t *vdata = (v128_t*)data;
const int buf_size = 64;
size_t ptr;
DECL_STATE
@@ -1809,7 +1797,7 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len )
if ( len < (buf_size - ptr ) )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
v128_memcpy( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
@@ -1824,7 +1812,7 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len )
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
v128_memcpy( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += clen>>2;
@@ -1850,7 +1838,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
__m128i *buf;
v128_t *buf;
const int buf_size = 64;
size_t ptr;
int i;
@@ -1862,7 +1850,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>2] = v128_32( zz );
memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
v128_memset_zero( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
READ_STATE(sc);
DECODE_BLOCK;
INPUT_BLOCK_ADD;
@@ -1876,7 +1864,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
APPLY_P;
}
__m128i *d = (__m128i*)dst;
v128_t *d = (v128_t*)dst;
if ( size_words == 16 ) // 512
{
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;

View File

@@ -1,7 +1,7 @@
#ifndef SHABAL_HASH_4WAY_H__
#define SHABAL_HASH_4WAY_H__ 1
#ifdef __SSE4_1__
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#include <stddef.h>
#include "simd-utils.h"
@@ -65,8 +65,8 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
#endif
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
v128_t buf[16] __attribute__ ((aligned (64)));
v128_t A[12], B[16], C[16];
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;

algo/shavite/shavite-hash.h (new file, 315 lines)
View File

@@ -0,0 +1,315 @@
/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */
/**
* SHAvite-3 interface. This code implements SHAvite-3 with the
* recommended parameters for SHA-3, with outputs of 224, 256, 384 and
* 512 bits. In the following, we call the function "SHAvite" (without
* the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit
* output".
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_shavite.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_SHAVITE_H__
#define SPH_SHAVITE_H__
#include <stddef.h>
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif
/**
* Output size (in bits) for SHAvite-224.
*/
#define SPH_SIZE_shavite224 224
/**
* Output size (in bits) for SHAvite-256.
*/
#define SPH_SIZE_shavite256 256
/**
* Output size (in bits) for SHAvite-384.
*/
#define SPH_SIZE_shavite384 384
/**
* Output size (in bits) for SHAvite-512.
*/
#define SPH_SIZE_shavite512 512
/**
* This structure is a context for SHAvite-224 and SHAvite-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a SHAvite computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running SHAvite
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64] __attribute__ ((aligned (64)));
sph_u32 h[8] __attribute__ ((aligned (32)));
size_t ptr;
sph_u32 count0, count1;
#endif
} sph_shavite_small_context;
/**
* This structure is a context for SHAvite-224 computations. It is
* identical to the common <code>sph_shavite_small_context</code>.
*/
typedef sph_shavite_small_context sph_shavite224_context;
/**
* This structure is a context for SHAvite-256 computations. It is
* identical to the common <code>sph_shavite_small_context</code>.
*/
typedef sph_shavite_small_context sph_shavite256_context;
/**
* This structure is a context for SHAvite-384 and SHAvite-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a SHAvite computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running SHAvite
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128] __attribute__ ((aligned (64)));
sph_u32 h[16] __attribute__ ((aligned (32)));
size_t ptr;
sph_u32 count0, count1, count2, count3;
#endif
} sph_shavite_big_context;
/**
* This structure is a context for SHAvite-384 computations. It is
* identical to the common <code>sph_shavite_big_context</code>.
*/
typedef sph_shavite_big_context sph_shavite384_context;
/**
* This structure is a context for SHAvite-512 computations. It is
* identical to the common <code>sph_shavite_big_context</code>.
*/
typedef sph_shavite_big_context sph_shavite512_context;
/**
* Initialize a SHAvite-224 context. This process performs no memory allocation.
*
* @param cc the SHAvite-224 context (pointer to a
* <code>sph_shavite224_context</code>)
*/
void sph_shavite224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite224(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-224 context
* @param dst the destination buffer
*/
void sph_shavite224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (28 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
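/*
 * Usage sketch for the ub/n convention described above: the n extra bits
 * occupy the most significant bits of ub, first extra bit in position 7.
 * So, to append the three bits 1,0,1 (in that order), pass ub = 0xA0 and
 * n = 3. The helper name and its data/len parameters are hypothetical;
 * only the declarations in this header are assumed.
 */
static inline void example_shavite224_append_101( const void *data,
                                                  size_t len,
                                                  unsigned char dst[28] )
{
   sph_shavite224_context cc;
   sph_shavite224_init( &cc );
   sph_shavite224( &cc, data, len );                /* whole message bytes */
   /* bits 7,6,5 of ub carry the extra bits 1,0,1 => ub = 0xA0, n = 3 */
   sph_shavite224_addbits_and_close( &cc, 0xA0, 3, dst );
}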
/**
* Initialize a SHAvite-256 context. This process performs no memory allocation.
*
* @param cc the SHAvite-256 context (pointer to a
* <code>sph_shavite256_context</code>)
*/
void sph_shavite256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite256(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-256 context
* @param dst the destination buffer
*/
void sph_shavite256_close(void *cc, void *dst);
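/*
 * Streaming sketch: data may be fed in several calls, and, as noted for
 * the context structures, a running computation can be forked with a
 * plain memcpy() of the context. The helper name and its parameters are
 * hypothetical; memcpy() additionally assumes <string.h>.
 */
static inline void example_shavite256_fork( const void *common, size_t clen,
                                            const void *tail_a, size_t alen,
                                            const void *tail_b, size_t blen,
                                            unsigned char out_a[32],
                                            unsigned char out_b[32] )
{
   sph_shavite256_context cc, fork;
   sph_shavite256_init( &cc );
   sph_shavite256( &cc, common, clen );   /* shared message prefix */
   memcpy( &fork, &cc, sizeof cc );       /* clone the running state */
   sph_shavite256( &cc, tail_a, alen );
   sph_shavite256_close( &cc, out_a );    /* 32-byte digest */
   sph_shavite256( &fork, tail_b, blen );
   sph_shavite256_close( &fork, out_b );
}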
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (32 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a SHAvite-384 context. This process performs no memory allocation.
*
* @param cc the SHAvite-384 context (pointer to a
* <code>sph_shavite384_context</code>)
*/
void sph_shavite384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite384(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-384 context
* @param dst the destination buffer
*/
void sph_shavite384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (48 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
//Don't call these directly from application code, use the macros below.
#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
void sph_shavite512_aesni_init(void *cc);
void sph_shavite512_aesni(void *cc, const void *data, size_t len);
void sph_shavite512_aesni_close(void *cc, void *dst);
void sph_shavite512_aesni_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define sph_shavite512_init sph_shavite512_aesni_init
#define sph_shavite512 sph_shavite512_aesni
#define sph_shavite512_close sph_shavite512_aesni_close
#define sph_shavite512_addbits_and_close \
sph_shavite512_aesni_addbits_and_close
#else
void sph_shavite512_sw_init(void *cc);
void sph_shavite512_sw(void *cc, const void *data, size_t len);
void sph_shavite512_sw_close(void *cc, void *dst);
void sph_shavite512_sw_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define sph_shavite512_init sph_shavite512_sw_init
#define sph_shavite512 sph_shavite512_sw
#define sph_shavite512_close sph_shavite512_sw_close
#define sph_shavite512_addbits_and_close \
sph_shavite512_sw_addbits_and_close
#endif
// Use these macros from application code.
#define shavite512_context sph_shavite512_context
#define shavite512_init sph_shavite512_init
#define shavite512_update sph_shavite512
#define shavite512_close sph_shavite512_close
#define shavite512_full( cc, dst, data, len ) \
do{ \
shavite512_init( cc ); \
shavite512_update( cc, data, len ); \
shavite512_close( cc, dst ); \
}while(0)
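/*
 * One-shot sketch using the application-level macros above; the wrapper
 * name and its parameters are hypothetical.
 */
static inline void example_shavite512_oneshot( const void *data, size_t len,
                                               unsigned char digest[64] )
{
   shavite512_context cc;
   shavite512_full( &cc, digest, data, len );   /* init + update + close */
}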
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -33,7 +33,9 @@
#include <stddef.h>
#include <string.h>
#if defined(__AES__)
#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#pragma message "AES for shavite"
#include "sph_shavite.h"
#include "simd-utils.h"
@@ -50,24 +52,21 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
#define C32 SPH_C32
static const sph_u32 IV512[] = {
C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
const __m128i zero = _mm_setzero_si128();
__m128i p0, p1, p2, p3, x;
__m128i k00, k01, k02, k03, k10, k11, k12, k13;
__m128i *m = (__m128i*)msg;
__m128i *h = (__m128i*)sc->h;
const v128_t zero = v128_zero;
v128_t p0, p1, p2, p3, x;
v128_t k00, k01, k02, k03, k10, k11, k12, k13;
v128_t *m = (v128_t*)msg;
v128_t *h = (v128_t*)sc->h;
int r;
p0 = h[0];
@@ -78,242 +77,242 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( p1, k00 );
x = v128_aesenc( x, zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = m[2];
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = m[3];
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p0 = _mm_xor_si128( p0, x );
p0 = v128_xor( p0, x );
k10 = m[4];
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( p3, k10 );
x = v128_aesenc( x, zero );
k11 = m[5];
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = m[6];
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = m[7];
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p2 = _mm_xor_si128( p2, x );
p2 = v128_xor( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_xor( k00, k13 );
if ( r == 0 )
k00 = _mm_xor_si128( k00, _mm_set_epi32(
k00 = v128_xor( k00, v128_set32(
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = v128_xor( p0, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
k01 = v128_xor( k01, k00 );
if ( r == 1 )
k01 = _mm_xor_si128( k01, _mm_set_epi32(
k01 = v128_xor( k01, v128_set32(
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p3 = _mm_xor_si128( p3, x );
p3 = v128_xor( p3, x );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_xor( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = v128_xor( p2, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
k13 = v128_xor( k13, k12 );
if ( r == 2 )
k13 = _mm_xor_si128( k13, _mm_set_epi32(
k13 = v128_xor( k13, v128_set32(
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
p1 = _mm_xor_si128( p1, x );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p1 = v128_xor( p1, x );
// round 2, 6, 10
k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p3, k00 );
x = v128_aesenc( x, zero );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p2 = _mm_xor_si128( p2, x );
p2 = v128_xor( p2, x );
k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p1, k10 );
x = v128_aesenc( x, zero );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p0 = _mm_xor_si128( p0, x );
p0 = v128_xor( p0, x );
// round 3, 7, 11
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p2, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p1 = _mm_xor_si128( p1, x );
p1 = v128_xor( p1, x );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p0, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p3 = _mm_xor_si128( p3, x );
p3 = v128_xor( p3, x );
// round 4, 8, 12
k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p1, k00 );
x = v128_aesenc( x, zero );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p0 = _mm_xor_si128( p0, x );
p0 = v128_xor( p0, x );
k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p3, k10 );
x = v128_aesenc( x, zero );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p2 = _mm_xor_si128( p2, x );
p2 = v128_xor( p2, x );
}
// round 13
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p0, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p3 = _mm_xor_si128( p3, x );
p3 = v128_xor( p3, x );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
k12 = v128_xor( k12, v128_xor( k11, v128_set32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p1 = _mm_xor_si128( p1, x );
p1 = v128_xor( p1, x );
h[0] = _mm_xor_si128( h[0], p2 );
h[1] = _mm_xor_si128( h[1], p3 );
h[2] = _mm_xor_si128( h[2], p0 );
h[3] = _mm_xor_si128( h[3], p1 );
h[0] = v128_xor( h[0], p2 );
h[1] = v128_xor( h[1], p3 );
h[2] = v128_xor( h[2], p0 );
h[3] = v128_xor( h[3], p1 );
}

View File

@@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
//Don't call these directly from application code, use the macros below.
#if defined(__AES__) && defined(__SSSE3__)
#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
void sph_shavite512_aesni_init(void *cc);
void sph_shavite512_aesni(void *cc, const void *data, size_t len);

View File

@@ -4,6 +4,9 @@
#include "nist.h"
#include "vector.h"
#if defined(__SSE2__)
#define PRINT_SOME 0
int SupportedLength(int hashbitlen) {
@@ -938,3 +941,5 @@ void fft128_natural(fft_t *x, unsigned char *a) {
x[2*i+1] = y[i+64];
}
}
#endif // SSE2

View File

@@ -3,14 +3,10 @@
#include "compat.h"
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
/*******************************
* Using GCC vector extensions *
*******************************/
#if defined(__SSE2__)
//typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef char v16qi __attribute__ ((vector_size (16)));
typedef short v8hi __attribute__ ((vector_size (16)));
@@ -65,6 +61,10 @@ union u32 {
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif
//TODO aarch support for widening multiply
#if defined(__SSE2__)
#define vec_and(x,y) ((x)&(y))
#define vec_or(x,y) ((x)|(y))
#define vec_xor(x,y) ((x)^(y))
@@ -127,72 +127,11 @@ union u32 {
#define CV(x) {{x, x, x, x, x, x, x, x}}
#elif defined(__ALTIVEC__)
#include <altivec.h>
typedef vector unsigned char v8;
typedef vector signed short v16;
typedef vector unsigned int v32;
#define V3216(x) ((v16) (x))
#define V1632(x) ((v32) (x))
#define V168(x) ( (v8) (x))
#define V816(x) ((v16) (x))
#define V16_SIZE 8
#define print_vec print_sse
#define MAKE_VECT(x, ...) {{x, __VA_ARGS__}}
#define CV(x) MAKE_VECT(x, x, x, x, x, x, x, x)
#define CV16(x) ((vector signed short) {x,x,x,x,x,x,x,x})
#define CVU16(x) ((vector unsigned short) {x,x,x,x,x,x,x,x})
#define CV32(x) ((vector unsigned int ) {x,x,x,x})
union cv {
unsigned short u16[8];
v16 v16;
};
union cv8 {
unsigned char u8[16];
v8 v8;
};
union ucv {
unsigned short u16[8];
vector unsigned char v16;
};
// Nasty hack to avoid macro expansion madness
/* altivec.h is broken with Gcc 3.3 is C99 mode */
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
#define typeof __typeof
#endif
MAYBE_INLINE v16 vec_and_fun (v16 x, v16 y) {
return vec_and (x, y);
}
MAYBE_INLINE v16 vec_or_fun (v16 x, v16 y) {
return vec_or (x, y);
}
MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) {
return vec_xor (x, y);
}
#undef vec_and
#undef vec_or
#undef vec_xor
#define vec_and(x,y) ((__typeof(x)) vec_and_fun((v16) x, (v16) y))
#define vec_or(x,y) ((__typeof(x)) vec_or_fun((v16) x, (v16) y))
#define vec_xor(x,y) ((__typeof(x)) vec_xor_fun((v16) x, (v16) y))
#elif defined(__aarch64__) && defined(__ARM_NEON)
#define vec_and( x, y ) v128_and( x, y )
#define vec_or(x,y) v128_or( x, y )
#define vec_xor(x,y) v128_xor( x, y )
#define v16_and vec_and
#define v16_or vec_or
@@ -202,128 +141,36 @@ MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) {
#define v32_or vec_or
#define v32_xor vec_xor
#define vec_andn( x,y ) v128_andnot( x, y )
#define v16_andn vec_andn
#define v32_andn vec_andn
#define v32_add vec_add
#define v32_add( x, y ) v128_add32( x, y )
#define v16_add vec_add
#define v16_sub vec_sub
#define v16_mul(a,b) vec_mladd(a,b,CV16(0))
#define v16_add( x, y ) v128_add16( x, y )
#define v16_sub( x, y ) v128_sub16( x, y )
#define v16_mul( x, y ) v128_mul16( x, y )
#define v16_neg(x) v128_negate16( x )
#define v16_shift_l( x, c ) v128_sl16( x, c )
#define v16_shift_r v128_sr16
#define v16_cmp v128_cmpgt16
vector unsigned short ZZ = {0,0,0,0,0,0,0,0};
#define v16_interleavel v128_unpacklo16
#define v16_interleaveh v128_unpackhi16
v16 v16_shift_l(v16 x,int s) {
vector unsigned short shift = {s,s,s,s,s,s,s,s};
v16 y = vec_sl (x, shift);
return y;
}
#define v16_shift_l(x,s) vec_sl (x,CVU16(s))
#define v16_shift_r(x,s) vec_sra(x,CVU16(s))
#define v16_cmp vec_cmpgt
// the builtins compile for arm, so ???
#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b))
#define v16_mergel(a,b) V1632(vec_mergeh(b,a))
#define v16_mergeh(a,b) V1632(vec_mergel(b,a))
#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
#define v16_interleavel(a,b) vec_mergeh(a,b)
#define v16_interleaveh(a,b) vec_mergel(a,b)
#define v32_shift_l v128_sl32
#define v32_shift_r v128_sr32
#define v8_mergel(a,b) V816(vec_mergeh(b,a))
#define v8_mergeh(a,b) V816(vec_mergel(b,a))
#define v32_rotate(x,n) v128_rol32( x, n )
#define v32_rotate(x,s) vec_rl(x,CV32(s))
// #define v32_unpckl vec_mergel
// #define v32_unpckh vec_mergeh
#define vector_shuffle(x,s) vec_perm(x,x,s)
static const v8 SHUFXOR_1 = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
static const v8 SHUFXOR_2 = {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7};
static const v8 SHUFXOR_3 = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
#define v32_shufxor(x,s) vector_shuffle(x,SHUFXOR_##s)
//static const v8 SHUFSWAP = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
static const v8 SHUFSWAP = {3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12};
#define v32_bswap(x) vector_shuffle(x,SHUFSWAP)
#else
#error "I don't know how to vectorize on this architecture."
#endif
#else
/********************************
* Using MSVC/ICC vector intrinsics *
********************************/
#include <emmintrin.h>
typedef __m128i v8;
typedef __m128i v16;
typedef __m128i v32;
#define V3216(x) (x)
#define V1632(x) (x)
#define V168(x) (x)
#define V816(x) (x)
#define V16_SIZE 8
union cv {
unsigned short u16[8];
v16 v16;
};
union cv8 {
unsigned char u8[16];
v8 v8;
};
#define CV(x) {{x, x, x, x, x, x, x, x}}
#define vec_and _mm_and_si128
#define vec_or _mm_or_si128
#define vec_xor _mm_xor_si128
#define v16_and vec_and
#define v16_or vec_or
#define v16_xor vec_xor
#define v32_and vec_and
#define v32_or vec_or
#define v32_xor vec_xor
#define vector_shuffle(x,s) _mm_shuffle_epi8(x, s)
#define v32_add _mm_add_epi32
#define v16_add _mm_add_epi16
#define v16_sub _mm_sub_epi16
#define v16_mul _mm_mullo_epi16
#define v16_neg(x) (-(x))
#define v16_shift_l _mm_slli_epi16
#define v16_shift_r _mm_srai_epi16
#define v16_cmp _mm_cmpgt_epi16
#define v16_interleavel _mm_unpacklo_epi16
#define v16_interleaveh _mm_unpackhi_epi16
#define v16_mergel _mm_unpacklo_epi16
#define v16_mergeh _mm_unpackhi_epi16
#define v8_mergel _mm_unpacklo_epi8
#define v8_mergeh _mm_unpackhi_epi8
#define v32_shift_l _mm_slli_epi32
#define v32_shift_r _mm_srli_epi32
#define v32_rotate(x,n) \
vec_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))
#define v32_shuf _mm_shuffle_epi32
#define v32_shuf __builtin_ia32_pshufd
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
@@ -332,13 +179,25 @@ union cv8 {
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
//#define v32_shufxor(x,s) v32_shuf(x,SHUFXOR_##s)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
#define v32_bswap(x) (x)
#define v16_broadcast(x) ({ \
union u32 u; \
u32 xx = x; \
u.u[0] = xx | (xx << 16); \
V3216(v32_shuf(u.v,0)); })
#define CV(x) {{x, x, x, x, x, x, x, x}}
#else
#error "I don't know how to vectorize on this architecture."
#endif
/* Twiddle tables */
static const union cv FFT64_Twiddle[] = {

View File

@@ -65,8 +65,8 @@ extern "C" {
#endif
typedef struct {
__m128i block[16] __attribute__ ((aligned (64)));
__m128i digest[8];
v128_t block[16] __attribute__ ((aligned (64)));
v128_t digest[8];
uint32_t nblocks;
uint32_t num;
} sm3_4way_ctx_t;

View File

@@ -714,42 +714,42 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#undef Q_REDUCE
#elif defined(__SSE4_1__)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
__m128i F[16] __attribute__ ((aligned (64)));
__m128i *mul = (__m128i*)multipliers;
__m128i *out = (__m128i*)output;
__m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] );
v128_t F[16] __attribute__ ((aligned (64)));
v128_t *mul = (v128_t*)multipliers;
v128_t *out = (v128_t*)output;
v128_t *tbl = (v128_t*)&( fftTable[ input[0] << 3 ] );
F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] );
F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[1] << 3 ] );
F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] );
F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[2] << 3 ] );
F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] );
F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[3] << 3 ] );
F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] );
F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[4] << 3 ] );
F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] );
F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[5] << 3 ] );
F[10] = _mm_mullo_epi32( mul[10], tbl[0] );
F[11] = _mm_mullo_epi32( mul[11], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[6] << 3 ] );
F[12] = _mm_mullo_epi32( mul[12], tbl[0] );
F[13] = _mm_mullo_epi32( mul[13], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[7] << 3 ] );
F[14] = _mm_mullo_epi32( mul[14], tbl[0] );
F[15] = _mm_mullo_epi32( mul[15], tbl[1] );
F[ 0] = v128_mullo32( mul[ 0], tbl[0] );
F[ 1] = v128_mullo32( mul[ 1], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[1] << 3 ] );
F[ 2] = v128_mullo32( mul[ 2], tbl[0] );
F[ 3] = v128_mullo32( mul[ 3], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[2] << 3 ] );
F[ 4] = v128_mullo32( mul[ 4], tbl[0] );
F[ 5] = v128_mullo32( mul[ 5], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[3] << 3 ] );
F[ 6] = v128_mullo32( mul[ 6], tbl[0] );
F[ 7] = v128_mullo32( mul[ 7], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[4] << 3 ] );
F[ 8] = v128_mullo32( mul[ 8], tbl[0] );
F[ 9] = v128_mullo32( mul[ 9], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[5] << 3 ] );
F[10] = v128_mullo32( mul[10], tbl[0] );
F[11] = v128_mullo32( mul[11], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[6] << 3 ] );
F[12] = v128_mullo32( mul[12], tbl[0] );
F[13] = v128_mullo32( mul[13], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[7] << 3 ] );
F[14] = v128_mullo32( mul[14], tbl[0] );
F[15] = v128_mullo32( mul[15], tbl[1] );
#define ADD_SUB( a, b ) \
{ \
__m128i tmp = b; \
b = _mm_sub_epi32( a, b ); \
a = _mm_add_epi32( a, tmp ); \
v128_t tmp = b; \
b = v128_sub32( a, b ); \
a = v128_add32( a, tmp ); \
}
ADD_SUB( F[ 0], F[ 2] );
@@ -760,10 +760,10 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
ADD_SUB( F[ 9], F[11] );
ADD_SUB( F[12], F[14] );
ADD_SUB( F[13], F[15] );
F[ 6] = _mm_slli_epi32( F[ 6], 4 );
F[ 7] = _mm_slli_epi32( F[ 7], 4 );
F[14] = _mm_slli_epi32( F[14], 4 );
F[15] = _mm_slli_epi32( F[15], 4 );
F[ 6] = v128_sl32( F[ 6], 4 );
F[ 7] = v128_sl32( F[ 7], 4 );
F[14] = v128_sl32( F[14], 4 );
F[15] = v128_sl32( F[15], 4 );
ADD_SUB( F[ 0], F[ 4] );
ADD_SUB( F[ 1], F[ 5] );
ADD_SUB( F[ 2], F[ 6] );
@@ -772,12 +772,12 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
ADD_SUB( F[ 9], F[13] );
ADD_SUB( F[10], F[14] );
ADD_SUB( F[11], F[15] );
F[10] = _mm_slli_epi32( F[10], 2 );
F[11] = _mm_slli_epi32( F[11], 2 );
F[12] = _mm_slli_epi32( F[12], 4 );
F[13] = _mm_slli_epi32( F[13], 4 );
F[14] = _mm_slli_epi32( F[14], 6 );
F[15] = _mm_slli_epi32( F[15], 6 );
F[10] = v128_sl32( F[10], 2 );
F[11] = v128_sl32( F[11], 2 );
F[12] = v128_sl32( F[12], 4 );
F[13] = v128_sl32( F[13], 4 );
F[14] = v128_sl32( F[14], 6 );
F[15] = v128_sl32( F[15], 6 );
ADD_SUB( F[ 0], F[ 8] );
ADD_SUB( F[ 1], F[ 9] );
ADD_SUB( F[ 2], F[10] );
@@ -789,10 +789,10 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#undef ADD_SUB
const __m128i mask = _mm_set1_epi32( 0x000000ff );
const v128_t mask = v128_32( 0x000000ff );
#define Q_REDUCE( a ) \
_mm_sub_epi32( _mm_and_si128( a, mask ), _mm_srai_epi32( a, 8 ) )
v128_sub32( v128_and( a, mask ), v128_sra32( a, 8 ) )
out[ 0] = Q_REDUCE( F[ 0] );
out[ 1] = Q_REDUCE( F[ 1] );
@@ -1261,14 +1261,14 @@ void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output,
#elif defined(__SSE4_1__)
__m128i *res = (__m128i*)result;
v128_t *res = (v128_t*)result;
for ( j = 0; j < N/4; ++j )
{
__m128i sum = _mm_setzero_si128();
const __m128i *f = (__m128i*)input + j;
const __m128i *k = (__m128i*)a + j;
v128_t sum = v128_zero;
const v128_t *f = (v128_t*)input + j;
const v128_t *k = (v128_t*)a + j;
for ( i = 0; i < m; i++, f += N/4, k += N/4 )
sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) );
sum = v128_add32( sum, v128_mullo32( *f, *k ) );
res[j] = sum;
}

View File

@@ -101,7 +101,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
verthash_sha3_512_prehash_72( edata );
do

View File

@@ -12,9 +12,13 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -37,7 +41,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
hashState_luffa luffa;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -59,7 +67,11 @@ void init_c11_ctx()
sph_skein512_init( &c11_ctx.skein );
sph_jh512_init( &c11_ctx.jh );
sph_keccak512_init( &c11_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init( &c11_ctx.luffa );
#else
init_luffa( &c11_ctx.luffa, 512 );
#endif
cubehashInit( &c11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_ctx.shavite );
init_sd( &c11_ctx.simd, 512 );
@@ -94,8 +106,13 @@ void c11_hash( void *output, const void *input )
sph_skein512( &ctx.skein, (const void*) hash, 64 );
sph_skein512_close( &ctx.skein, hash );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cube, (byte*)hash,
(const byte*)hash, 64 );

View File

@@ -144,17 +144,17 @@ void timetravel_4way_hash(void *output, const void *input)
break;
case 7:
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)hash0, dataLen );
cubehashUpdateDigest( &ctx.cube, hash0,
hash0, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)hash1, dataLen );
cubehashUpdateDigest( &ctx.cube, hash1,
hash1, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)hash2, dataLen );
cubehashUpdateDigest( &ctx.cube, hash2,
hash2, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, hash3,
hash3, dataLen );
if ( i != 7 )
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;

View File

@@ -11,13 +11,17 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
@@ -28,7 +32,11 @@ typedef struct {
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
#ifdef __AES__
hashState_groestl groestl;
@@ -47,7 +55,11 @@ void init_tt8_ctx()
sph_skein512_init( &tt_ctx.skein );
sph_jh512_init( &tt_ctx.jh );
sph_keccak512_init( &tt_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init( &tt_ctx.luffa );
#else
init_luffa( &tt_ctx.luffa, 512 );
#endif
cubehashInit( &tt_ctx.cube, 512, 16, 32 );
#ifdef __AES__
init_groestl( &tt_ctx.groestl, 64 );
@@ -171,26 +183,37 @@ void timetravel_hash(void *output, const void *input)
case 6:
if ( i == 0 )
{
#if defined(__aarch64__)
memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence *)input + 64, 16 );
sph_luffa512( &ctx.luffa, input + 64, 16 );
sph_luffa512_close( &ctx.luffa, hashB );
#else
memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa );
update_and_final_luffa( &ctx.luffa, hashB,
input + 64, 16 );
#endif
}
else
{
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence *)hashA, dataLen );
#if defined(__aarch64__)
sph_luffa512( &ctx.luffa, hashA, dataLen );
sph_luffa512_close( &ctx.luffa, hashB );
#else
update_and_final_luffa( &ctx.luffa, hashB,
hashA, dataLen );
#endif
}
break;
case 7:
if ( i == 0 )
{
memcpy( &ctx.cube, &tt_mid.cube, sizeof tt_mid.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB,
(const byte*)input + midlen, tail );
cubehashUpdateDigest( &ctx.cube, hashB,
input + midlen, tail );
}
else
{
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)hashA,
cubehashUpdateDigest( &ctx.cube, hashB, hashA,
dataLen );
}
break;
@@ -264,11 +287,15 @@ int scanhash_timetravel( struct work *work, uint32_t max_nonce,
break;
case 6:
memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) );
update_luffa( &tt_mid.luffa, (const BitSequence*)endiandata, 64 );
#if defined(__aarch64__)
sph_luffa512( &tt_mid.luffa, endiandata, 64 );
#else
update_luffa( &tt_mid.luffa, endiandata, 64 );
#endif
break;
case 7:
memcpy( &tt_mid.cube, &tt_ctx.cube, sizeof(tt_mid.cube ) );
cubehashUpdate( &tt_mid.cube, (const byte*)endiandata, 64 );
cubehashUpdate( &tt_mid.cube, endiandata, 64 );
break;
default:
break;

View File

@@ -151,17 +151,17 @@ void timetravel10_4way_hash(void *output, const void *input)
case 7:
dintrlv_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)hash0, dataLen );
cubehashUpdateDigest( &ctx.cube, hash0,
hash0, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)hash1, dataLen );
cubehashUpdateDigest( &ctx.cube, hash1,
hash1, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)hash2, dataLen );
cubehashUpdateDigest( &ctx.cube, hash2,
hash2, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, hash3,
hash3, dataLen );
if ( i != 9 )
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;

View File

@@ -11,7 +11,6 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/nist.h"
@@ -20,6 +19,11 @@
#else
#include "algo/groestl/sph_groestl.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -30,7 +34,11 @@ typedef struct {
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -51,7 +59,11 @@ void init_tt10_ctx()
sph_skein512_init( &tt10_ctx.skein );
sph_jh512_init( &tt10_ctx.jh );
sph_keccak512_init( &tt10_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init( &tt10_ctx.luffa );
#else
init_luffa( &tt10_ctx.luffa, 512 );
#endif
cubehashInit( &tt10_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_ctx.shavite );
init_sd( &tt10_ctx.simd, 512 );
@@ -177,14 +189,25 @@ void timetravel10_hash(void *output, const void *input)
case 6:
if ( i == 0 )
{
#if defined(__aarch64__)
memcpy( &ctx.luffa, &tt10_mid.luffa, sizeof tt10_mid.luffa );
sph_luffa512( &ctx.luffa, input + 64, 16 );
sph_luffa512_close( &ctx.luffa, hashB );
#else
memcpy( &ctx.luffa, &tt10_mid.luffa, sizeof tt10_mid.luffa );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence *)input + 64, 16 );
#endif
}
else
{
#if defined(__aarch64__)
sph_luffa512( &ctx.luffa, hashA, dataLen );
sph_luffa512_close( &ctx.luffa, hashB );
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence *)hashA, dataLen );
#endif
}
break;
case 7:
@@ -297,7 +320,11 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
break;
case 6:
memcpy( &tt10_mid.luffa, &tt10_ctx.luffa, sizeof(tt10_mid.luffa ) );
#if defined(__aarch64__)
sph_luffa512( &tt10_mid.luffa, endiandata, 64 );
#else
update_luffa( &tt10_mid.luffa, (const BitSequence*)endiandata, 64 );
#endif
break;
case 7:
memcpy( &tt10_mid.cube, &tt10_ctx.cube, sizeof(tt10_mid.cube ) );

View File

@@ -13,7 +13,6 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
@@ -24,6 +23,11 @@
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -38,7 +42,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
hashState_luffa luffa;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -60,7 +68,11 @@ void init_x11_ctx()
sph_skein512_init( &x11_ctx.skein );
sph_jh512_init( &x11_ctx.jh );
sph_keccak512_init( &x11_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init( &x11_ctx.luffa );
#else
init_luffa( &x11_ctx.luffa, 512 );
#endif
cubehashInit( &x11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_ctx.shavite );
init_sd( &x11_ctx.simd, 512 );
@@ -97,8 +109,13 @@ void x11_hash( void *state, const void *input )
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_luffa( &ctx.luffa, (const BitSequence*)hash, 64 );
final_luffa( &ctx.luffa, (BitSequence*)hash );
#endif
cubehashUpdate( &ctx.cube, (const byte*) hash, 64 );
cubehashDigest( &ctx.cube, (byte*)hash );

View File

@@ -19,9 +19,13 @@
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
#ifdef __AES__
@@ -31,7 +35,11 @@ typedef struct {
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
hashState_luffa luffa;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
hashState_sd simd;
sph_blake512_context blake;
@@ -53,7 +61,11 @@ void init_x11evo_ctx()
sph_groestl512_init( &x11evo_ctx.groestl );
sph_echo512_init( &x11evo_ctx.echo );
#endif
#if defined(__aarch64__)
sph_luffa512_init( &x11evo_ctx.luffa );
#else
init_luffa( &x11evo_ctx.luffa, 512 );
#endif
cubehashInit( &x11evo_ctx.cube, 512, 16, 32 );
init_sd( &x11evo_ctx.simd, 512 );
sph_blake512_init( &x11evo_ctx.blake );
@@ -124,9 +136,14 @@ void x11evo_hash( void *state, const void *input )
sph_keccak512_close( &ctx.keccak, (char*)hash );
break;
case 6:
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (char*)hash,
(const char*)hash, 64 );
break;
#endif
break;
case 7:
cubehashUpdateDigest( &ctx.cube, (char*)hash,
(const char*)hash, 64 );

View File

@@ -13,7 +13,6 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
@@ -24,6 +23,11 @@
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -38,7 +42,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -63,7 +71,11 @@ void init_x11gost_ctx()
sph_keccak512_init( &x11gost_ctx.keccak );
sph_gost512_init( &x11gost_ctx.gost );
sph_shavite512_init( &x11gost_ctx.shavite );
#if defined(__aarch64__)
sph_luffa512_init(&x11gost_ctx.luffa );
#else
init_luffa( &x11gost_ctx.luffa, 512 );
#endif
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
init_sd( &x11gost_ctx.simd, 512 );
}
@@ -102,8 +114,14 @@ void x11gost_hash(void *output, const void *input)
sph_gost512( &ctx.gost, hash, 64 );
sph_gost512_close( &ctx.gost, hash );
#if defined(__aarch64__)
sph_luffa512_init(&ctx.luffa );
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)hash, 64 );

View File

@@ -16,13 +16,17 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -37,7 +41,11 @@ typedef struct {
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
hashState_luffa luffa;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -60,7 +68,11 @@ void init_x12_ctx()
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#endif
init_luffa( &x12_ctx.luffa, 512 );
#if defined(__aarch64__)
sph_luffa512_init(&x12_ctx.luffa );
#else
init_luffa( &x12_ctx.luffa, 512 );
#endif
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
init_sd( &x12_ctx.simd, 512 );
@@ -82,8 +94,13 @@ void x12hash(void *output, const void *input)
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hashB);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hashB, 64 );

View File

@@ -72,7 +72,7 @@ void phi1612_hash(void *output, const void *input)
sph_jh512( &ctx.jh, (const void*)hash, 64 );
sph_jh512_close( &ctx.jh, (void*)hash );
cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
#if defined(__AES__)
fugue512_Update( &ctx.fugue, hash, 512 );

View File

@@ -38,7 +38,7 @@ void skunkhash( void *output, const void *input )
sph_skein512( &ctx.skein, input+64, 16 );
sph_skein512_close( &ctx.skein, (void*) hash );
cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
#if defined(__AES__)
fugue512_Update( &ctx.fugue, hash, 512 );

View File

@@ -26,6 +26,11 @@
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -42,7 +47,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -67,7 +76,11 @@ void init_x13_ctx()
sph_skein512_init( &x13_ctx.skein );
sph_jh512_init( &x13_ctx.jh );
sph_keccak512_init( &x13_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init(&x13_ctx.luffa );
#else
init_luffa( &x13_ctx.luffa, 512 );
#endif
cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x13_ctx.shavite );
init_sd( &x13_ctx.simd, 512 );
@@ -103,8 +116,13 @@ void x13hash(void *output, const void *input)
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );

View File

@@ -143,7 +143,6 @@ void x13sm3_hash(void *output, const void *input)
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
asm volatile ("emms");
memcpy(output, hash, 32);
}

View File

@@ -9,12 +9,16 @@
#include "algo/skein/sph_skein.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue//sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h"
#ifdef __AES__
#include "algo/echo/aes_ni/hash_api.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_skein512_context skein;
@@ -24,7 +28,11 @@ typedef struct {
#else
sph_echo512_context echo;
#endif
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_ctx_holder;
@@ -40,7 +48,11 @@ void init_polytimos_ctx()
#else
sph_echo512_init(&poly_ctx.echo);
#endif
#if defined(__aarch64__)
sph_luffa512_init(&poly_ctx.luffa );
#else
init_luffa( &poly_ctx.luffa, 512 );
#endif
sph_fugue512_init(&poly_ctx.fugue);
sph_gost512_init(&poly_ctx.gost);
}
@@ -65,8 +77,13 @@ void polytimos_hash(void *output, const void *input)
sph_echo512_close(&ctx.echo, hashA);
#endif
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA,
(const BitSequence*)hashA, 64 );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hashA, 64);
sph_luffa512_close(&ctx.luffa, hashA);
#else
update_and_final_luffa( &ctx.luffa, hashA,
hashA, 64 );
#endif
sph_fugue512(&ctx.fugue, hashA, 64);
sph_fugue512_close(&ctx.fugue, hashA);

View File

@@ -14,7 +14,6 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#if defined(__AES__)
@@ -26,6 +25,11 @@
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -42,7 +46,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -68,7 +76,11 @@ void init_x14_ctx()
sph_skein512_init( &x14_ctx.skein );
sph_jh512_init( &x14_ctx.jh );
sph_keccak512_init( &x14_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init( &x14_ctx.luffa );
#else
init_luffa( &x14_ctx.luffa,512 );
#endif
cubehashInit( &x14_ctx.cube,512,16,32 );
sph_shavite512_init( &x14_ctx.shavite );
init_sd( &x14_ctx.simd,512 );
@@ -105,8 +117,13 @@ void x14hash(void *output, const void *input)
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)hash, 64 );

View File

@@ -16,7 +16,6 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
@@ -29,6 +28,11 @@
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -45,7 +49,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -72,7 +80,11 @@ void init_x15_ctx()
sph_skein512_init( &x15_ctx.skein );
sph_jh512_init( &x15_ctx.jh );
sph_keccak512_init( &x15_ctx.keccak );
init_luffa( &x15_ctx.luffa, 512 );
#if defined(__aarch64__)
sph_luffa512_init( &x15_ctx.luffa );
#else
init_luffa( &x15_ctx.luffa,512 );
#endif
cubehashInit( &x15_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x15_ctx.shavite );
init_sd( &x15_ctx.simd, 512 );
@@ -112,8 +124,13 @@ void x15hash(void *output, const void *input)
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );

View File

@@ -86,13 +86,26 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case LUFFA:
if ( i == 0 )
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
{
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) in+64, 16 );
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in+64, 16 );
#endif
}
else
{
#if defined(__aarch64__)
sph_luffa512_init(&ctx.luffa );
sph_luffa512(&ctx.luffa, (const void*) in, size );
sph_luffa512_close(&ctx.luffa, hash);
#else
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in, size );
#endif
}
break;
case CUBEHASH:
@@ -192,7 +205,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
mm128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = swab32(pdata[17]);
@@ -218,8 +231,13 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
sph_skein512( &hex_ctx.skein, edata, 64 );
break;
case LUFFA:
#if defined(__aarch64__)
sph_luffa512_init(&hex_ctx.luffa );
sph_luffa512(&hex_ctx.luffa, (const void*) edata, 64);
#else
init_luffa( &hex_ctx.luffa, 512 );
update_luffa( &hex_ctx.luffa, (const BitSequence*)edata, 64 );
#endif
break;
case CUBEHASH:
cubehashInit( &hex_ctx.cube, 512, 16, 32 );

Some files were not shown because too many files have changed in this diff.