From 31c4dedf5945d46d76104d6d0e92658ada0ec694 Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Fri, 6 Oct 2023 22:18:09 -0400
Subject: [PATCH] v3.23.4

---
 Makefile.am | 5 +-
 RELEASE_NOTES | 10 +
 algo-gate-api.c | 2 +-
 algo-gate-api.h | 6 +-
 algo/argon2/argon2a/ar2/opt.c | 4 +
 algo/argon2/argon2d/argon2d-gate.c | 4 +-
 algo/argon2/argon2d/argon2d/opt.c | 26 +-
 algo/argon2/argon2d/blake2/blamka-round-opt.h | 101 +-
 algo/blake/blake-4way.c | 2 +-
 algo/blake/blake256-hash.c | 210 +-
 algo/blake/blake2b.c | 2 +-
 algo/blake/blake2s-hash.c | 129 +-
 algo/blake/blake2s-hash.h | 58 +-
 algo/blake/blake2s.c | 23 +-
 algo/blake/blake512-hash.c | 72 +-
 algo/blake/blakecoin-4way.c | 2 +-
 algo/blake/sph-blake2s.c | 54 +-
 algo/blake/sph-blake2s.h | 21 +-
 algo/blake/sph_blake2b.c | 38 +
 algo/cubehash/cubehash_sse2.c | 236 +-
 algo/cubehash/cubehash_sse2.h | 20 +-
 algo/echo/aes_ni/hash.c | 3 +
 algo/echo/aes_ni/hash_api.h | 10 +-
 algo/groestl/aes_ni/hash-groestl.h | 7 +-
 algo/groestl/aes_ni/hash-groestl256.h | 6 +-
 algo/groestl/groestl256-hash-4way.h | 1 -
 algo/groestl/groestl512-hash-4way.h | 1 -
 algo/groestl/myrgr-4way.c | 2 +-
 algo/haval/haval-4way-helper.c | 14 +-
 algo/haval/haval-hash-4way.c | 165 +-
 algo/haval/haval-hash-4way.h | 6 +-
 algo/hodl/aes.c | 5 +-
 algo/hodl/hodl-gate.c | 1 +
 algo/hodl/hodl-wolf.c | 2 +-
 algo/hodl/hodl-wolf.h | 4 +-
 algo/hodl/sha512-avx.h | 6 +-
 algo/hodl/wolf-aes.h | 12 +-
 algo/keccak/keccak-4way.c | 16 +-
 algo/keccak/keccak-hash-4way.c | 18 +-
 algo/keccak/keccak-hash-4way.h | 112 +-
 algo/keccak/sha3d-4way.c | 24 +-
 algo/luffa/luffa-hash-2way.c | 1 -
 algo/luffa/luffa_for_sse2.c | 463 ++--
 algo/luffa/luffa_for_sse2.h | 22 +-
 algo/lyra2/lyra2-gate.c | 2 +-
 algo/lyra2/lyra2h-4way.c | 2 +-
 algo/lyra2/lyra2rev2-4way.c | 7 +-
 algo/lyra2/lyra2rev3-4way.c | 2 +-
 algo/lyra2/lyra2rev3.c | 10 +-
 algo/lyra2/lyra2z-4way.c | 2 +-
 algo/lyra2/lyra2z.c | 1 -
 algo/lyra2/lyra2z330.c | 11 +-
 algo/lyra2/sponge-2way.c | 2 +-
 algo/lyra2/sponge.c | 2 +-
 algo/lyra2/sponge.h | 4 +-
 algo/panama/panama-hash-4way.c | 90 +-
 algo/panama/panama-hash-4way.h | 4 +-
 algo/qubit/deep.c | 13 +-
 algo/qubit/qubit.c | 1 -
 algo/ripemd/lbry.c | 1 +
 algo/scrypt/scrypt-core-4way.c | 554 ++---
 algo/scrypt/scrypt-core-4way.h | 8 +-
 algo/scrypt/scrypt.c | 124 +-
 algo/sha/hmac-sha256-hash-4way.c | 3 +
 algo/sha/hmac-sha256-hash-4way.h | 9 +-
 algo/sha/sha2.c | 3 +
 algo/sha/sha256-hash-4way.c | 373 ++-
 algo/sha/sha256-hash.c | 2011 ++++++++---------
 algo/sha/sha256-hash.h | 84 +-
 algo/sha/sha256d-4way.c | 138 +-
 algo/sha/sha256d-4way.h | 9 +
 algo/sha/sha256dt.c | 147 +-
 algo/sha/sha256q-4way.c | 6 +-
 algo/sha/sha256q.c | 2 +-
 algo/sha/sha256t-4way.c | 142 +-
 algo/sha/sha256t-gate.c | 18 +-
 algo/sha/sha256t-gate.h | 13 +
 algo/shabal/shabal-hash-4way.c | 260 +--
 algo/shabal/shabal-hash-4way.h | 6 +-
 algo/shavite/shavite-hash.h | 315 +++
 algo/shavite/sph-shavite-aesni.c | 383 ++--
 algo/shavite/sph_shavite.h | 2 +-
 algo/simd/vector.c | 5 +
 algo/simd/vector.h | 227 +-
 algo/sm3/sm3-hash-4way.h | 4 +-
 algo/swifftx/swifftx.c | 96 +-
 algo/verthash/verthash-gate.c | 2 +-
 algo/x11/c11.c | 21 +-
 algo/x11/timetravel-4way.c | 16 +-
 algo/x11/timetravel.c | 47 +-
 algo/x11/timetravel10-4way.c | 16 +-
 algo/x11/timetravel10.c | 29 +-
 algo/x11/x11.c | 21 +-
 algo/x11/x11evo.c | 23 +-
 algo/x11/x11gost.c | 20 +-
 algo/x12/x12.c | 23 +-
 algo/x13/phi1612.c | 2 +-
 algo/x13/skunk.c | 2 +-
 algo/x13/x13.c | 18 +
 algo/x13/x13sm3.c | 1 -
 algo/x14/polytimos.c | 23 +-
 algo/x14/x14.c | 19 +-
 algo/x15/x15.c | 21 +-
 algo/x16/hex.c | 22 +-
 algo/x16/minotaur.c | 18 +-
 algo/x16/x16r-4way.c | 18 +-
 algo/x16/x16r-gate.h | 53 +-
 algo/x16/x16r.c | 21 +-
 algo/x16/x16rt.c | 2 +-
 algo/x16/x16rv2-4way.c | 14 +-
 algo/x16/x16rv2.c | 20 +-
 algo/x16/x21s.c | 3 +-
 algo/x17/sonoa.c | 49 +-
 algo/x17/x17.c | 2 +-
 algo/x17/xevan.c | 27 +-
 algo/x22/x22i.c | 18 +-
 algo/x22/x25x.c | 19 +-
 algo/yespower/yespower-blake2b-ref.c | 593 +++++
 algo/yespower/yespower-gate.c | 57 +-
 algo/yespower/yespower-opt.c | 4 +
 algo/yespower/yespower-ref.c | 10 +-
 algo/yespower/yespower.h | 15 +
 build-allarch.sh | 36 +-
 build-armv8.sh | 15 +
 build-avx2.sh | 2 +-
 build.sh | 2 +-
 clean-all.sh | 4 +-
 compat/sha3-defs.h | 1 -
 configure | 20 +-
 configure.ac | 2 +-
 configure~ | 20 +-
 cpu-miner.c | 162 +-
 miner.h | 20 +-
 simd-utils.h | 45 +-
 simd-utils/intrlv.h | 180 +-
 simd-utils/simd-128.h | 192 +-
 simd-utils/simd-256.h | 2 +-
 simd-utils/simd-512.h | 2 +-
 simd-utils/simd-64.h | 2 +-
 simd-utils/simd-int.h | 69 +
 simd-utils/simd-neon.h | 242 ++
 sysinfos.c | 360 ++-
 util.c | 6 +-
 winbuild-cross.sh | 2 +-
 144 files changed, 5931 insertions(+), 3746 deletions(-)
 create mode 100644 algo/shavite/shavite-hash.h
 create mode 100644 algo/yespower/yespower-blake2b-ref.c
 create mode 100755 build-armv8.sh
 create mode 100644 simd-utils/simd-neon.h

diff --git a/Makefile.am b/Makefile.am
index eef6e10..2b61285 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -280,7 +280,10 @@ cpuminer_SOURCES = \
   algo/yespower/yespower-blake2b.c \
   algo/yespower/crypto/hmac-blake2b.c \
   algo/yespower/yescrypt-r8g.c \
-  algo/yespower/yespower-opt.c
+  algo/yespower/yespower-opt.c \
+  algo/yespower/yespower-ref.c \
+  algo/yespower/yespower-blake2b-ref.c
+
 
 disable_flags =
 
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 0633965..d169a9d 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,8 +65,18 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.23.4
+
+Source code only.
+
+Initial experimental support for ARM AArch64 with NEON, SHA2 & AES.
+Arm support is considered alpha quality, use at your own risk.
+x86_64 is unaffected.
+
 v3.23.3
 
+#402: Windows binaries package rebuilt with openssl v1.1.1w (libcrypto-1_1-x64.dll).
+
 #400: Removed excessive thread restarts when mining solo.
 Fixed build_msys2.sh for gcc-13 by removing unsupported option "--param=evrp-mode=legacy" from CFLAGS.
 Added CPUID detection and reporting of CPUs and SW builds supporting SHA512 extension.
diff --git a/algo-gate-api.c b/algo-gate-api.c
index e86b304..4c29c8c 100644
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -109,7 +109,7 @@ int scanhash_generic( struct work *work, uint32_t max_nonce,
    const int thr_id = mythr->id;
    const bool bench = opt_benchmark;
 
-   mm128_bswap32_80( edata, pdata );
+   v128_bswap32_80( edata, pdata );
    do
    {
       edata[19] = n;
diff --git a/algo-gate-api.h b/algo-gate-api.h
index 6f2985b..4c90958 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -89,15 +89,15 @@ typedef uint32_t set_t;
 
 #define EMPTY_SET 0
 
-#define SSE2_OPT 1
+#define SSE2_OPT 1 // Core2, NEON
 #define AES_OPT 2
 #define SSE42_OPT 4
 #define AVX_OPT 8 // Sandybridge
 #define AVX2_OPT 0x10 // Haswell, Zen1
-#define SHA_OPT 0x20 // Zen1, Icelake (deprecated)
+#define SHA_OPT 0x20 // Zen1, Icelake.
NEON #define AVX512_OPT 0x40 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW]) #define VAES_OPT 0x80 // Icelake, Zen3 -#define SHA512_OPT 0x100 // Lunar Lake, Arrow Lake +#define SHA512_OPT 0x100 // Lunar Lake, Arrow Lake, NEON // AVX10 does not have explicit algo features: // AVX10_512 is compatible with AVX512 + VAES diff --git a/algo/argon2/argon2a/ar2/opt.c b/algo/argon2/argon2a/ar2/opt.c index feda867..3115cd1 100644 --- a/algo/argon2/argon2a/ar2/opt.c +++ b/algo/argon2/argon2a/ar2/opt.c @@ -17,6 +17,8 @@ #include #include +#if defined(__SSE2__) + #include #include "argon2.h" @@ -183,3 +185,5 @@ void ar2_fill_segment(const argon2_instance_t *instance, free(pseudo_rands); } + +#endif diff --git a/algo/argon2/argon2d/argon2d-gate.c b/algo/argon2/argon2d/argon2d-gate.c index cd41a32..6693299 100644 --- a/algo/argon2/argon2d/argon2d-gate.c +++ b/algo/argon2/argon2d/argon2d-gate.c @@ -114,7 +114,7 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce, uint32_t nonce = first_nonce; const bool bench = opt_benchmark; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); do { edata[19] = nonce; @@ -160,7 +160,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce, uint32_t parallelism = 1; // 1 thread, 2 lanes const bool bench = opt_benchmark; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); do { edata[19] = n; diff --git a/algo/argon2/argon2d/argon2d/opt.c b/algo/argon2/argon2d/argon2d/opt.c index 5164a1e..fd00aba 100644 --- a/algo/argon2/argon2d/argon2d/opt.c +++ b/algo/argon2/argon2d/argon2d/opt.c @@ -131,22 +131,22 @@ static void fill_block(__m256i *state, const block *ref_block, #else // SSE2 -static void fill_block(__m128i *state, const block *ref_block, +static void fill_block( v128_t *state, const block *ref_block, block *next_block, int with_xor) { - __m128i block_XY[ARGON2_OWORDS_IN_BLOCK]; + v128_t block_XY[ARGON2_OWORDS_IN_BLOCK]; unsigned int i; if (with_xor) { for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - state[i] = _mm_xor_si128( - state[i], _mm_load_si128((const __m128i *)ref_block->v + i)); - block_XY[i] = _mm_xor_si128( - state[i], _mm_load_si128((const __m128i *)next_block->v + i)); + state[i] = v128_xor( + state[i], v128_load((const v128_t *)ref_block->v + i)); + block_XY[i] = v128_xor( + state[i], v128_load((const v128_t *)next_block->v + i)); } } else { for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - block_XY[i] = state[i] = _mm_xor_si128( - state[i], _mm_load_si128((const __m128i *)ref_block->v + i)); + block_XY[i] = state[i] = v128_xor( + state[i], v128_load((const v128_t *)ref_block->v + i)); } } @@ -185,8 +185,8 @@ static void fill_block(__m128i *state, const block *ref_block, state[39], state[47], state[55], state[63] ); for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - state[i] = _mm_xor_si128(state[i], block_XY[i]); - _mm_store_si128((__m128i *)next_block->v + i, state[i]); + state[i] = v128_xor(state[i], block_XY[i]); + v128_store((v128_t *)next_block->v + i, state[i]); } } @@ -202,8 +202,8 @@ static void next_addresses(block *address_block, block *input_block) { __m256i zero_block[ARGON2_HWORDS_IN_BLOCK]; __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK]; #else - __m128i zero_block[ARGON2_OWORDS_IN_BLOCK]; - __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK]; + v128_t zero_block[ARGON2_OWORDS_IN_BLOCK]; + v128_t zero2_block[ARGON2_OWORDS_IN_BLOCK]; #endif memset(zero_block, 0, sizeof(zero_block)); @@ -232,7 +232,7 @@ void fill_segment(const argon2_instance_t *instance, #elif defined(__AVX2__) __m256i 
state[ARGON2_HWORDS_IN_BLOCK]; #else - __m128i state[ARGON2_OWORDS_IN_BLOCK]; + v128_t state[ARGON2_OWORDS_IN_BLOCK]; #endif // int data_independent_addressing; diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 4cb8bda..77f9b22 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -19,16 +19,6 @@ #define BLAKE_ROUND_MKA_OPT_H #include "blake2-impl.h" - -#include -#if defined(__SSSE3__) -#include /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */ -#endif - -#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__)) -#include -#endif - #include "simd-utils.h" #if !defined(__AVX512F__) @@ -39,7 +29,7 @@ (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) #define r24 \ (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) -#define _mm_roti_epi64(x, c) \ +#define v128_ror64(x, c) \ (-(c) == 32) \ ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ : (-(c) == 24) \ @@ -47,20 +37,20 @@ : (-(c) == 16) \ ? _mm_shuffle_epi8((x), r16) \ : (-(c) == 63) \ - ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ - _mm_add_epi64((x), (x))) \ - : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ - _mm_slli_epi64((x), 64 - (-(c)))) + ? v128_xor(v128_sr64((x), -(c)), \ + v128_add64((x), (x))) \ + : v128_xor(v128_sr64((x), -(c)), \ + v128_sl64((x), 64 - (-(c)))) #else /* defined(__SSE2__) */ -#define _mm_roti_epi64(r, c) \ - _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c)))) +#define v128_ror64(r, c) \ + v128_xor(v128_sr64((r), -(c)), v128_sl64((r), 64 - (-(c)))) #endif #else #endif -static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { - const __m128i z = _mm_mul_epu32(x, y); - return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); +static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) { + const v128_t z = v128_mul32(x, y); + return v128_add64(v128_add64(x, y), v128_add64(z, z)); } #define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -68,20 +58,20 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { A0 = fBlaMka(A0, B0); \ A1 = fBlaMka(A1, B1); \ \ - D0 = _mm_xor_si128(D0, A0); \ - D1 = _mm_xor_si128(D1, A1); \ + D0 = v128_xor(D0, A0); \ + D1 = v128_xor(D1, A1); \ \ - D0 = _mm_roti_epi64(D0, -32); \ - D1 = _mm_roti_epi64(D1, -32); \ + D0 = v128_ror64(D0, -32); \ + D1 = v128_ror64(D1, -32); \ \ C0 = fBlaMka(C0, D0); \ C1 = fBlaMka(C1, D1); \ \ - B0 = _mm_xor_si128(B0, C0); \ - B1 = _mm_xor_si128(B1, C1); \ + B0 = v128_xor(B0, C0); \ + B1 = v128_xor(B1, C1); \ \ - B0 = _mm_roti_epi64(B0, -24); \ - B1 = _mm_roti_epi64(B1, -24); \ + B0 = v128_ror64(B0, -24); \ + B1 = v128_ror64(B1, -24); \ } while ((void)0, 0) #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -89,27 +79,27 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { A0 = fBlaMka(A0, B0); \ A1 = fBlaMka(A1, B1); \ \ - D0 = _mm_xor_si128(D0, A0); \ - D1 = _mm_xor_si128(D1, A1); \ + D0 = v128_xor(D0, A0); \ + D1 = v128_xor(D1, A1); \ \ - D0 = _mm_roti_epi64(D0, -16); \ - D1 = _mm_roti_epi64(D1, -16); \ + D0 = v128_ror64(D0, -16); \ + D1 = v128_ror64(D1, -16); \ \ C0 = fBlaMka(C0, D0); \ C1 = fBlaMka(C1, D1); \ \ - B0 = _mm_xor_si128(B0, C0); \ - B1 = _mm_xor_si128(B1, C1); \ + B0 = v128_xor(B0, C0); \ + B1 = v128_xor(B1, C1); \ \ - B0 = _mm_roti_epi64(B0, -63); \ - B1 = _mm_roti_epi64(B1, -63); \ + B0 = v128_ror64(B0, -63); \ + B1 = v128_ror64(B1, -63); \ } while ((void)0, 0) #if defined(__SSSE3__) #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ 
- __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \ - __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \ + v128_t t0 = v128_alignr8(B1, B0, 8); \ + v128_t t1 = v128_alignr8(B0, B1, 8); \ B0 = t0; \ B1 = t1; \ \ @@ -117,16 +107,16 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { C0 = C1; \ C1 = t0; \ \ - t0 = _mm_alignr_epi8(D1, D0, 8); \ - t1 = _mm_alignr_epi8(D0, D1, 8); \ + t0 = v128_alignr8(D1, D0, 8); \ + t1 = v128_alignr8(D0, D1, 8); \ D0 = t1; \ D1 = t0; \ } while ((void)0, 0) #define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ - __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \ - __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \ + v128_t t0 = v128_alignr8(B0, B1, 8); \ + v128_t t1 = v128_alignr8(B1, B0, 8); \ B0 = t0; \ B1 = t1; \ \ @@ -134,37 +124,37 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { C0 = C1; \ C1 = t0; \ \ - t0 = _mm_alignr_epi8(D0, D1, 8); \ - t1 = _mm_alignr_epi8(D1, D0, 8); \ + t0 = v128_alignr8(D0, D1, 8); \ + t1 = v128_alignr8(D1, D0, 8); \ D0 = t1; \ D1 = t0; \ } while ((void)0, 0) #else /* SSE2 */ #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ - __m128i t0 = D0; \ - __m128i t1 = B0; \ + v128_t t0 = D0; \ + v128_t t1 = B0; \ D0 = C0; \ C0 = C1; \ C1 = D0; \ - D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \ - D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \ - B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \ - B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \ + D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \ + D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \ + B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \ + B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \ } while ((void)0, 0) #define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ - __m128i t0, t1; \ + v128_t t0, t1; \ t0 = C0; \ C0 = C1; \ C1 = t0; \ t0 = B0; \ t1 = D0; \ - B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \ - B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \ - D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \ - D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \ + B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \ + B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \ + D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \ + D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \ } while ((void)0, 0) #endif @@ -462,4 +452,5 @@ static inline __m512i muladd(__m512i x, __m512i y) } while ((void)0, 0) #endif /* __AVX512F__ */ + #endif /* BLAKE_ROUND_MKA_OPT_H */ diff --git a/algo/blake/blake-4way.c b/algo/blake/blake-4way.c index d318653..6671bfa 100644 --- a/algo/blake/blake-4way.c +++ b/algo/blake/blake-4way.c @@ -34,7 +34,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce, if (opt_benchmark) HTarget = 0x7f; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); blake256r14_4way_init( &blake_4w_ctx ); blake256r14_4way_update( &blake_4w_ctx, vdata, 64 ); diff --git a/algo/blake/blake256-hash.c b/algo/blake/blake256-hash.c index 6e22514..0647113 100644 --- a/algo/blake/blake256-hash.c +++ b/algo/blake/blake256-hash.c @@ -277,56 +277,56 @@ static const unsigned sigma[16][16] = { #define BLAKE256_ROUND( r ) \ { \ - V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \ - _mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \ + V0 = v128_add32( V0, v128_add32( V1, \ + v128_set_32( CSx( r, 7 ) ^ Mx( r, 6 ), \ CSx( r, 5 ) ^ Mx( r, 4 ), \ CSx( r, 3 ) ^ Mx( r, 2 ), \ CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \ - V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); 
\ - V2 = _mm_add_epi32( V2, V3 ); \ - V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \ - V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \ - _mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \ + V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \ + V2 = v128_add32( V2, V3 ); \ + V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \ + V0 = v128_add32( V0, v128_add32( V1, \ + v128_set_32( CSx( r, 6 ) ^ Mx( r, 7 ), \ CSx( r, 4 ) ^ Mx( r, 5 ), \ CSx( r, 2 ) ^ Mx( r, 3 ), \ CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \ - V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \ - V2 = _mm_add_epi32( V2, V3 ); \ - V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \ - V0 = mm128_shufll_32( V0 ); \ - V3 = mm128_swap_64( V3 ); \ - V2 = mm128_shuflr_32( V2 ); \ - V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \ - _mm_set_epi32( CSx( r, D ) ^ Mx( r, C ), \ + V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \ + V2 = v128_add32( V2, V3 ); \ + V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \ + V0 = v128_shufll32( V0 ); \ + V3 = v128_swap64( V3 ); \ + V2 = v128_shuflr32( V2 ); \ + V0 = v128_add32( V0, v128_add32( V1, \ + v128_set_32( CSx( r, D ) ^ Mx( r, C ), \ CSx( r, B ) ^ Mx( r, A ), \ CSx( r, 9 ) ^ Mx( r, 8 ), \ CSx( r, F ) ^ Mx( r, E ) ) ) ); \ - V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \ - V2 = _mm_add_epi32( V2, V3 ); \ - V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \ - V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \ - _mm_set_epi32( CSx( r, C ) ^ Mx( r, D ), \ + V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \ + V2 = v128_add32( V2, V3 ); \ + V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \ + V0 = v128_add32( V0, v128_add32( V1, \ + v128_set_32( CSx( r, C ) ^ Mx( r, D ), \ CSx( r, A ) ^ Mx( r, B ), \ CSx( r, 8 ) ^ Mx( r, 9 ), \ CSx( r, E ) ^ Mx( r, F ) ) ) ); \ - V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \ - V2 = _mm_add_epi32( V2, V3 ); \ - V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \ - V0 = mm128_shuflr_32( V0 ); \ - V3 = mm128_swap_64( V3 ); \ - V2 = mm128_shufll_32( V2 ); \ + V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \ + V2 = v128_add32( V2, V3 ); \ + V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \ + V0 = v128_shuflr32( V0 ); \ + V3 = v128_swap64( V3 ); \ + V2 = v128_shufll32( V2 ); \ } // Default is 14 rounds, blakecoin & vanilla are 8. 
void blake256_transform_le( uint32_t *H, const uint32_t *buf, const uint32_t T0, const uint32_t T1, int rounds ) { - __m128i V0, V1, V2, V3; + v128_t V0, V1, V2, V3; uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; - V0 = casti_m128i( H, 0 ); - V1 = casti_m128i( H, 1 ); - V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 ); - V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98, + V0 = casti_v128( H, 0 ); + V1 = casti_v128( H, 1 ); + V2 = v128_set_32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 ); + V3 = v128_set_32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98, T0 ^ 0x299F31D0, T0 ^ 0xA4093822 ); M0 = buf[ 0]; M1 = buf[ 1]; @@ -361,8 +361,8 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, BLAKE256_ROUND( 2 ); BLAKE256_ROUND( 3 ); } - casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 ); - casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 ); + casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V0, V2 ) ); + casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V1, V3 ) ); } //////////////////////////////////////////// @@ -371,16 +371,16 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \ { \ - a = _mm_add_epi32( _mm_add_epi32( a, b ), \ - _mm_xor_si128( v128_32( c1 ), m0 ) ); \ - d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ - a = _mm_add_epi32( _mm_add_epi32( a, b ), \ - _mm_xor_si128( v128_32( c0 ), m1 ) ); \ - d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ + a = v128_add32( v128_add32( a, b ), \ + v128_xor( v128_32( c1 ), m0 ) ); \ + d = v128_swap32_16( v128_xor( d, a ) ); \ + c = v128_add32( c, d ); \ + b = v128_ror32( v128_xor( b, c ), 12 ); \ + a = v128_add32( v128_add32( a, b ), \ + v128_xor( v128_32( c0 ), m1 ) ); \ + d = v128_shuflr32_8( v128_xor( d, a ) ); \ + c = v128_add32( c, d ); \ + b = v128_ror32( v128_xor( b, c ), 7 ); \ } #define ROUND_S_4WAY(r) \ @@ -396,31 +396,31 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, } #define DECL_STATE32_4WAY \ - __m128i H0, H1, H2, H3, H4, H5, H6, H7; \ + v128_t H0, H1, H2, H3, H4, H5, H6, H7; \ uint32_t T0, T1; #define READ_STATE32_4WAY(state) do { \ - H0 = casti_m128i( state->H, 0 ); \ - H1 = casti_m128i( state->H, 1 ); \ - H2 = casti_m128i( state->H, 2 ); \ - H3 = casti_m128i( state->H, 3 ); \ - H4 = casti_m128i( state->H, 4 ); \ - H5 = casti_m128i( state->H, 5 ); \ - H6 = casti_m128i( state->H, 6 ); \ - H7 = casti_m128i( state->H, 7 ); \ + H0 = casti_v128( state->H, 0 ); \ + H1 = casti_v128( state->H, 1 ); \ + H2 = casti_v128( state->H, 2 ); \ + H3 = casti_v128( state->H, 3 ); \ + H4 = casti_v128( state->H, 4 ); \ + H5 = casti_v128( state->H, 5 ); \ + H6 = casti_v128( state->H, 6 ); \ + H7 = casti_v128( state->H, 7 ); \ T0 = (state)->T0; \ T1 = (state)->T1; \ } while (0) #define WRITE_STATE32_4WAY(state) do { \ - casti_m128i( state->H, 0 ) = H0; \ - casti_m128i( state->H, 1 ) = H1; \ - casti_m128i( state->H, 2 ) = H2; \ - casti_m128i( state->H, 3 ) = H3; \ - casti_m128i( state->H, 4 ) = H4; \ - casti_m128i( state->H, 5 ) = H5; \ - casti_m128i( state->H, 6 ) = H6; \ - casti_m128i( state->H, 7 ) = H7; \ + casti_v128( state->H, 0 ) = H0; \ + casti_v128( state->H, 1 ) = H1; \ + casti_v128( state->H, 2 ) = H2; \ + casti_v128( state->H, 3 ) = H3; \ + casti_v128( state->H, 4 ) = H4; \ + casti_v128( state->H, 5 ) = 
H5; \ + casti_v128( state->H, 6 ) = H6; \ + casti_v128( state->H, 7 ) = H7; \ (state)->T0 = T0; \ (state)->T1 = T1; \ } while (0) @@ -430,7 +430,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, #define BLAKE256_4WAY_BLOCK_BSWAP32 \ { \ - __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ + v128_t shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ 0x0405060700010203 ); \ M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \ M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \ @@ -454,32 +454,32 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, #define BLAKE256_4WAY_BLOCK_BSWAP32 \ { \ - M0 = mm128_bswap_32( buf[0] ); \ - M1 = mm128_bswap_32( buf[1] ); \ - M2 = mm128_bswap_32( buf[2] ); \ - M3 = mm128_bswap_32( buf[3] ); \ - M4 = mm128_bswap_32( buf[4] ); \ - M5 = mm128_bswap_32( buf[5] ); \ - M6 = mm128_bswap_32( buf[6] ); \ - M7 = mm128_bswap_32( buf[7] ); \ - M8 = mm128_bswap_32( buf[8] ); \ - M9 = mm128_bswap_32( buf[9] ); \ - MA = mm128_bswap_32( buf[10] ); \ - MB = mm128_bswap_32( buf[11] ); \ - MC = mm128_bswap_32( buf[12] ); \ - MD = mm128_bswap_32( buf[13] ); \ - ME = mm128_bswap_32( buf[14] ); \ - MF = mm128_bswap_32( buf[15] ); \ + M0 = v128_bswap32( buf[0] ); \ + M1 = v128_bswap32( buf[1] ); \ + M2 = v128_bswap32( buf[2] ); \ + M3 = v128_bswap32( buf[3] ); \ + M4 = v128_bswap32( buf[4] ); \ + M5 = v128_bswap32( buf[5] ); \ + M6 = v128_bswap32( buf[6] ); \ + M7 = v128_bswap32( buf[7] ); \ + M8 = v128_bswap32( buf[8] ); \ + M9 = v128_bswap32( buf[9] ); \ + MA = v128_bswap32( buf[10] ); \ + MB = v128_bswap32( buf[11] ); \ + MC = v128_bswap32( buf[12] ); \ + MD = v128_bswap32( buf[13] ); \ + ME = v128_bswap32( buf[14] ); \ + MF = v128_bswap32( buf[15] ); \ } #endif // SSSE3 else SSE2 #define COMPRESS32_4WAY( rounds ) \ { \ - __m128i M0, M1, M2, M3, M4, M5, M6, M7; \ - __m128i M8, M9, MA, MB, MC, MD, ME, MF; \ - __m128i V0, V1, V2, V3, V4, V5, V6, V7; \ - __m128i V8, V9, VA, VB, VC, VD, VE, VF; \ + v128_t M0, M1, M2, M3, M4, M5, M6, M7; \ + v128_t M8, M9, MA, MB, MC, MD, ME, MF; \ + v128_t V0, V1, V2, V3, V4, V5, V6, V7; \ + v128_t V8, V9, VA, VB, VC, VD, VE, VF; \ V0 = H0; \ V1 = H1; \ V2 = H2; \ @@ -514,14 +514,14 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, ROUND_S_4WAY(2); \ ROUND_S_4WAY(3); \ } \ - H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \ - H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \ - H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \ - H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \ - H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \ - H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \ - H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \ - H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \ + H0 = v128_xor( v128_xor( V8, V0 ), H0 ); \ + H1 = v128_xor( v128_xor( V9, V1 ), H1 ); \ + H2 = v128_xor( v128_xor( VA, V2 ), H2 ); \ + H3 = v128_xor( v128_xor( VB, V3 ), H3 ); \ + H4 = v128_xor( v128_xor( VC, V4 ), H4 ); \ + H5 = v128_xor( v128_xor( VD, V5 ), H5 ); \ + H6 = v128_xor( v128_xor( VE, V6 ), H6 ); \ + H7 = v128_xor( v128_xor( VF, V7 ), H7 ); \ } #if defined (__AVX2__) @@ -1867,14 +1867,14 @@ static void blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, const uint32_t *salt, int rounds ) { - casti_m128i( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 ); - casti_m128i( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 ); - casti_m128i( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 ); - casti_m128i( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A ); - casti_m128i( ctx->H, 4 ) = v128_64( 
0x510E527F510E527F ); - casti_m128i( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C ); - casti_m128i( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB ); - casti_m128i( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 ); + casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 ); + casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 ); + casti_v128( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 ); + casti_v128( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A ); + casti_v128( ctx->H, 4 ) = v128_64( 0x510E527F510E527F ); + casti_v128( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C ); + casti_v128( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB ); + casti_v128( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 ); ctx->T0 = ctx->T1 = 0; ctx->ptr = 0; ctx->rounds = rounds; @@ -1884,7 +1884,7 @@ static void blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len ) { - __m128i *buf = (__m128i*)ctx->buf; + v128_t *buf = (v128_t*)ctx->buf; size_t bptr = ctx->ptr<<2; size_t vptr = ctx->ptr >> 2; size_t blen = len << 2; @@ -1925,7 +1925,7 @@ static void blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n, void *dst, size_t out_size_w32 ) { - __m128i buf[16] __attribute__ ((aligned (64))); + v128_t buf[16] __attribute__ ((aligned (64))); size_t ptr = ctx->ptr; size_t vptr = ctx->ptr>>2; unsigned bit_len = ( (unsigned)ptr << 3 ); @@ -1949,26 +1949,26 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n, if ( vptr < 12 ) { - memset_zero_128( buf + vptr + 1, 13 - vptr ); - buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) ); + v128_memset_zero( buf + vptr + 1, 13 - vptr ); + buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) ); buf[ 14 ] = v128_32( bswap_32( th ) ); buf[ 15 ] = v128_32( bswap_32( tl ) ); blake32_4way( ctx, buf + vptr, 64 - ptr ); } else { - memset_zero_128( buf + vptr + 1, (60-ptr) >> 2 ); + v128_memset_zero( buf + vptr + 1, (60-ptr) >> 2 ); blake32_4way( ctx, buf + vptr, 64 - ptr ); ctx->T0 = 0xFFFFFE00UL; ctx->T1 = 0xFFFFFFFFUL; - memset_zero_128( buf, 56>>2 ); - buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) ); + v128_memset_zero( buf, 56>>2 ); + buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) ); buf[ 14 ] = v128_32( bswap_32( th ) ); buf[ 15 ] = v128_32( bswap_32( tl ) ); blake32_4way( ctx, buf, 64 ); } - mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H ); + v128_block_bswap32( (v128_t*)dst, (v128_t*)ctx->H ); } #if defined (__AVX2__) diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c index 0e25e4a..7707366 100644 --- a/algo/blake/blake2b.c +++ b/algo/blake/blake2b.c @@ -138,7 +138,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - mm128_bswap32_80( endiandata, pdata ); + v128_bswap32_80( endiandata, pdata ); do { endiandata[19] = n; diff --git a/algo/blake/blake2s-hash.c b/algo/blake/blake2s-hash.c index 01bb85c..c644b32 100644 --- a/algo/blake/blake2s-hash.c +++ b/algo/blake/blake2s-hash.c @@ -12,13 +12,13 @@ */ #include "blake2s-hash.h" - +#include "simd-utils.h" #include #include #include //#if defined(__SSE4_2__) -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) /* static const uint32_t blake2s_IV[8] = @@ -78,43 +78,43 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen ) /* IV XOR ParamBlock */ for ( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) ); + S->h[i] = v128_xor( S->h[i], v128_32( p[i] ) ); return 0; } -int 
blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block ) +int blake2s_4way_compress( blake2s_4way_state *S, const v128_t* block ) { - __m128i m[16]; - __m128i v[16]; + v128_t m[16]; + v128_t v[16]; - memcpy_128( m, block, 16 ); - memcpy_128( v, S->h, 8 ); + v128_memcpy( m, block, 16 ); + v128_memcpy( v, S->h, 8 ); v[ 8] = v128_64( 0x6A09E6676A09E667ULL ); v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL ); v[10] = v128_64( 0x3C6EF3723C6EF372ULL ); v[11] = v128_64( 0xA54FF53AA54FF53AULL ); - v[12] = _mm_xor_si128( v128_32( S->t[0] ), + v[12] = v128_xor( v128_32( S->t[0] ), v128_64( 0x510E527F510E527FULL ) ); - v[13] = _mm_xor_si128( v128_32( S->t[1] ), + v[13] = v128_xor( v128_32( S->t[1] ), v128_64( 0x9B05688C9B05688CULL ) ); - v[14] = _mm_xor_si128( v128_32( S->f[0] ), + v[14] = v128_xor( v128_32( S->f[0] ), v128_64( 0x1F83D9AB1F83D9ABULL ) ); - v[15] = _mm_xor_si128( v128_32( S->f[1] ), + v[15] = v128_xor( v128_32( S->f[1] ), v128_64( 0x5BE0CD195BE0CD19ULL ) ); #define G4W( sigma0, sigma1, a, b, c, d ) \ do { \ uint8_t s0 = sigma0; \ uint8_t s1 = sigma1; \ - a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \ - d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ - a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \ - d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ + a = v128_add32( v128_add32( a, b ), m[ s0 ] ); \ + d = v128_swap32_16( v128_xor( d, a ) ); \ + c = v128_add32( c, d ); \ + b = v128_ror32( v128_xor( b, c ), 12 ); \ + a = v128_add32( v128_add32( a, b ), m[ s1 ] ); \ + d = v128_shuflr32_8( v128_xor( d, a ) ); \ + c = v128_add32( c, d ); \ + b = v128_ror32( v128_xor( b, c ), 7 ); \ } while(0) @@ -143,7 +143,7 @@ do { \ ROUND4W( 9 ); for( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] ); + S->h[i] = v128_xor( v128_xor( S->h[i], v[i] ), v[i + 8] ); #undef G4W #undef ROUND4W @@ -175,26 +175,26 @@ do { \ int blake2s_4way_update( blake2s_4way_state *S, const void *in, uint64_t inlen ) { - __m128i *input = (__m128i*)in; - __m128i *buf = (__m128i*)S->buf; + v128_t *input = (v128_t*)in; + v128_t *buf = (v128_t*)S->buf; while( inlen > 0 ) { size_t left = S->buflen; - if( inlen >= BLAKE2S_BLOCKBYTES - left ) + if( inlen >= 64 - left ) { - memcpy_128( buf + (left>>2), input, (BLAKE2S_BLOCKBYTES - left) >> 2 ); - S->buflen += BLAKE2S_BLOCKBYTES - left; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + v128_memcpy( buf + (left>>2), input, (64 - left) >> 2 ); + S->buflen += 64 - left; + S->t[0] += 64; + S->t[1] += ( S->t[0] < 64 ); blake2s_4way_compress( S, buf ); S->buflen = 0; - input += ( BLAKE2S_BLOCKBYTES >> 2 ); - inlen -= BLAKE2S_BLOCKBYTES; + input += ( 64 >> 2 ); + inlen -= 64; } else { - memcpy_128( buf + ( left>>2 ), input, inlen>>2 ); + v128_memcpy( buf + ( left>>2 ), input, inlen>>2 ); S->buflen += (size_t) inlen; input += ( inlen>>2 ); inlen -= inlen; @@ -205,7 +205,7 @@ int blake2s_4way_update( blake2s_4way_state *S, const void *in, int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen ) { - __m128i *buf = (__m128i*)S->buf; + v128_t *buf = (v128_t*)S->buf; S->t[0] += S->buflen; S->t[1] += ( S->t[0] < S->buflen ); @@ -213,12 +213,12 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen ) S->f[1] = ~0U; S->f[0] = ~0U; - memset_zero_128( buf + ( S->buflen>>2 ), - ( BLAKE2S_BLOCKBYTES - S->buflen 
) >> 2 ); + v128_memset_zero( buf + ( S->buflen>>2 ), + ( 64 - S->buflen ) >> 2 ); blake2s_4way_compress( S, buf ); for ( int i = 0; i < 8; ++i ) - casti_m128i( out, i ) = S->h[ i ]; + casti_v128( out, i ) = S->h[ i ]; return 0; } @@ -226,24 +226,24 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen ) int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, const void *input, uint64_t inlen ) { - __m128i *in = (__m128i*)input; - __m128i *buf = (__m128i*)S->buf; + v128_t *in = (v128_t*)input; + v128_t *buf = (v128_t*)S->buf; - while( inlen > BLAKE2S_BLOCKBYTES ) + while( inlen > 64 ) { - memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 ); - S->buflen = BLAKE2S_BLOCKBYTES; - inlen -= BLAKE2S_BLOCKBYTES; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + v128_memcpy( buf, in, 64 >> 2 ); + S->buflen = 64; + inlen -= 64; + S->t[0] += 64; + S->t[1] += ( S->t[0] < 64 ); blake2s_4way_compress( S, buf ); S->buflen = 0; - in += ( BLAKE2S_BLOCKBYTES >> 2 ); + in += ( 64 >> 2 ); } // last block - memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 ); - S->buflen = BLAKE2S_BLOCKBYTES; + v128_memcpy( buf, in, 64 >> 2 ); + S->buflen = 64; S->t[0] += S->buflen; S->t[1] += ( S->t[0] < S->buflen ); if ( S->last_node ) S->f[1] = ~0U; @@ -251,7 +251,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, blake2s_4way_compress( S, buf ); for ( int i = 0; i < 8; ++i ) - casti_m128i( out, i ) = S->h[ i ]; + casti_v128( out, i ) = S->h[ i ]; return 0; } @@ -417,7 +417,7 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in, { __m256i *input = (__m256i*)in; __m256i *buf = (__m256i*)S->buf; - const int bsize = BLAKE2S_BLOCKBYTES; + const int bsize = 64; while( inlen > 0 ) { @@ -426,8 +426,8 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in, { memcpy_256( buf + (left>>2), input, (bsize - left) >> 2 ); S->buflen += bsize - left; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + S->t[0] += 64; + S->t[1] += ( S->t[0] < 64 ); blake2s_8way_compress( S, buf ); S->buflen = 0; input += ( bsize >> 2 ); @@ -454,8 +454,7 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen ) S->f[1] = ~0U; S->f[0] = ~0U; - memset_zero_256( buf + ( S->buflen>>2 ), - ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 ); + memset_zero_256( buf + ( S->buflen>>2 ),( 64 - S->buflen ) >> 2 ); blake2s_8way_compress( S, buf ); for ( int i = 0; i < 8; ++i ) @@ -470,21 +469,21 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, __m256i *in = (__m256i*)input; __m256i *buf = (__m256i*)S->buf; - while( inlen > BLAKE2S_BLOCKBYTES ) + while( inlen > 64 ) { - memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 ); - S->buflen = BLAKE2S_BLOCKBYTES; - inlen -= BLAKE2S_BLOCKBYTES; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + memcpy_256( buf, in, 64 >> 2 ); + S->buflen = 64; + inlen -= 64; + S->t[0] += 64; + S->t[1] += ( S->t[0] < 64 ); blake2s_8way_compress( S, buf ); S->buflen = 0; - in += ( BLAKE2S_BLOCKBYTES >> 2 ); + in += ( 64 >> 2 ); } // last block - memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 ); - S->buflen = BLAKE2S_BLOCKBYTES; + memcpy_256( buf, in, 64 >> 2 ); + S->buflen = 64; S->t[0] += S->buflen; S->t[1] += ( S->t[0] < S->buflen ); if ( S->last_node ) S->f[1] = ~0U; @@ -611,7 +610,7 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in, { __m512i *input = (__m512i*)in; __m512i *buf = (__m512i*)S->buf; - const int bsize = BLAKE2S_BLOCKBYTES; + const int bsize 
= 64; while( inlen > 0 ) { @@ -620,8 +619,8 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in, { memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 ); S->buflen += bsize - left; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + S->t[0] += 64; + S->t[1] += ( S->t[0] < 64 ); blake2s_16way_compress( S, buf ); S->buflen = 0; input += ( bsize >> 2 ); @@ -649,7 +648,7 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen ) S->f[0] = ~0U; memset_zero_512( buf + ( S->buflen>>2 ), - ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 ); + ( 64 - S->buflen ) >> 2 ); blake2s_16way_compress( S, buf ); for ( int i = 0; i < 8; ++i ) diff --git a/algo/blake/blake2s-hash.h b/algo/blake/blake2s-hash.h index fc86c4f..2764a89 100644 --- a/algo/blake/blake2s-hash.h +++ b/algo/blake/blake2s-hash.h @@ -14,7 +14,7 @@ #ifndef __BLAKE2S_HASH_4WAY_H__ #define __BLAKE2S_HASH_4WAY_H__ 1 -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) #include "simd-utils.h" @@ -29,41 +29,25 @@ #define ALIGN(x) __attribute__((aligned(x))) #endif - -#if defined(__cplusplus) -extern "C" { -#endif - -enum blake2s_constant -{ - BLAKE2S_BLOCKBYTES = 64, - BLAKE2S_OUTBYTES = 32, - BLAKE2S_KEYBYTES = 32, - BLAKE2S_SALTBYTES = 8, - BLAKE2S_PERSONALBYTES = 8 -}; - -#pragma pack(push, 1) -typedef struct __blake2s_nway_param -{ - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint8_t node_offset[6];// 14 - uint8_t node_depth; // 15 - uint8_t inner_length; // 16 - // uint8_t reserved[0]; - uint8_t salt[BLAKE2S_SALTBYTES]; // 24 - uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 -} blake2s_nway_param; -#pragma pack(pop) + typedef struct __blake2s_nway_param + { + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint8_t node_offset[6];// 14 + uint8_t node_depth; // 15 + uint8_t inner_length; // 16 + // uint8_t reserved[0]; + uint8_t salt[8]; // 24 + uint8_t personal[8]; // 32 + } blake2s_nway_param; typedef struct ALIGN( 64 ) __blake2s_4way_state { - __m128i h[8]; - uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ]; + v128_t h[8]; + uint8_t buf[ 64 * 4 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; @@ -83,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, typedef struct ALIGN( 64 ) __blake2s_8way_state { __m256i h[8]; - uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ]; + uint8_t buf[ 32 * 8 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; @@ -104,7 +88,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, typedef struct ALIGN( 64 ) __blake2s_16way_state { __m512i h[8]; - uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ]; + uint8_t buf[ 32 * 16 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; @@ -127,10 +111,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen ); #define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0) #endif -#if defined(__cplusplus) -} -#endif - #endif // __SSE2__ #endif diff --git a/algo/blake/blake2s.c b/algo/blake/blake2s.c index 0641117..a146ea2 100644 --- a/algo/blake/blake2s.c +++ b/algo/blake/blake2s.c @@ -20,7 +20,7 @@ void blake2s_16way_hash( void *output, const void *input ) blake2s_16way_state ctx; memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx ); blake2s_16way_update( &ctx, input + (64<<4), 16 ); - blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES ); + blake2s_16way_final( &ctx, output, 32 ); } int scanhash_blake2s_16way( 
struct work *work, uint32_t max_nonce, @@ -39,7 +39,7 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; mm512_bswap32_intrlv80_16x32( vdata, pdata ); - blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES ); + blake2s_16way_init( &blake2s_16w_ctx, 32 ); blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 ); do { @@ -76,7 +76,7 @@ void blake2s_8way_hash( void *output, const void *input ) blake2s_8way_state ctx; memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx ); blake2s_8way_update( &ctx, input + (64<<3), 16 ); - blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES ); + blake2s_8way_final( &ctx, output, 32 ); } int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, @@ -95,7 +95,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; mm256_bswap32_intrlv80_8x32( vdata, pdata ); - blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES ); + blake2s_8way_init( &blake2s_8w_ctx, 32 ); blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 ); do { @@ -131,7 +131,7 @@ void blake2s_4way_hash( void *output, const void *input ) blake2s_4way_state ctx; memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx ); blake2s_4way_update( &ctx, input + (64<<2), 16 ); - blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES ); + blake2s_4way_final( &ctx, output, 32 ); } int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, @@ -149,8 +149,8 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; int thr_id = mythr->id; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); + blake2s_4way_init( &blake2s_4w_ctx, 32 ); blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 ); do { @@ -183,12 +183,12 @@ static __thread blake2s_state blake2s_ctx; void blake2s_hash( void *output, const void *input ) { - unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES]; + unsigned char _ALIGN(32) hash[32]; blake2s_state ctx __attribute__ ((aligned (32))); memcpy( &ctx, &blake2s_ctx, sizeof ctx ); blake2s_update( &ctx, input+64, 16 ); - blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES ); + blake2s_final( &ctx, hash, 32 ); memcpy(output, hash, 32); } @@ -201,14 +201,13 @@ int scanhash_blake2s( struct work *work,uint32_t max_nonce, uint32_t _ALIGN(32) hash32[8]; uint32_t _ALIGN(32) endiandata[20]; const int thr_id = mythr->id; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - mm128_bswap32_80( endiandata, pdata ); + v128_bswap32_80( endiandata, pdata ); // midstate - blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES ); + blake2s_init( &blake2s_ctx, 32 ); blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 ); do diff --git a/algo/blake/blake512-hash.c b/algo/blake/blake512-hash.c index 8987ce6..49c90f9 100644 --- a/algo/blake/blake512-hash.c +++ b/algo/blake/blake512-hash.c @@ -343,52 +343,52 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0, #define BLAKE512_G( r, Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \ { \ - Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \ - _mm_set_epi64x( CBx( r, Sd ) ^ Mx( r, Sc ), \ + Va = v128_add64( Va, v128_add64( Vb, \ + v128_set_64( CBx( r, Sd ) ^ Mx( r, Sc ), \ CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \ - Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \ - Vc = _mm_add_epi64( Vc, Vd ); \ - Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 25 ); \ + Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \ + Vc = v128_add64( Vc, Vd ); \ + Vb = v128_ror64( v128_xor( Vb, Vc ), 
25 ); \ \ - Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \ - _mm_set_epi64x( CBx( r, Sc ) ^ Mx( r, Sd ), \ + Va = v128_add64( Va, v128_add64( Vb, \ + v128_set_64( CBx( r, Sc ) ^ Mx( r, Sd ), \ CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \ - Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \ - Vc = _mm_add_epi64( Vc, Vd ); \ - Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 11 ); \ + Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \ + Vc = v128_add64( Vc, Vd ); \ + Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \ } #define BLAKE512_ROUND( R ) \ { \ - __m128i V32, V23, V67, V76; \ + v128_t V32, V23, V67, V76; \ BLAKE512_G( R, V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \ BLAKE512_G( R, V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \ - V32 = mm128_alignr_64( V[3], V[2], 1 ); \ - V23 = mm128_alignr_64( V[2], V[3], 1 ); \ - V67 = mm128_alignr_64( V[6], V[7], 1 ); \ - V76 = mm128_alignr_64( V[7], V[6], 1 ); \ + V32 = v128_alignr64( V[3], V[2], 1 ); \ + V23 = v128_alignr64( V[2], V[3], 1 ); \ + V67 = v128_alignr64( V[6], V[7], 1 ); \ + V76 = v128_alignr64( V[7], V[6], 1 ); \ BLAKE512_G( R, V[0], V32, V[5], V67, 8, 9, A, B ); \ BLAKE512_G( R, V[1], V23, V[4], V76, C, D, E, F ); \ - V[2] = mm128_alignr_64( V32, V23, 1 ); \ - V[3] = mm128_alignr_64( V23, V32, 1 ); \ - V[6] = mm128_alignr_64( V76, V67, 1 ); \ - V[7] = mm128_alignr_64( V67, V76, 1 ); \ + V[2] = v128_alignr64( V32, V23, 1 ); \ + V[3] = v128_alignr64( V23, V32, 1 ); \ + V[6] = v128_alignr64( V76, V67, 1 ); \ + V[7] = v128_alignr64( V67, V76, 1 ); \ } void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0, const uint64_t T1 ) { - __m128i V[8]; + v128_t V[8]; uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; - V[0] = casti_m128i( H, 0 ); - V[1] = casti_m128i( H, 1 ); - V[2] = casti_m128i( H, 2 ); - V[3] = casti_m128i( H, 3 ); - V[4] = _mm_set_epi64x( CB1, CB0 ); - V[5] = _mm_set_epi64x( CB3, CB2 ); - V[6] = _mm_set_epi64x( T0 ^ CB5, T0 ^ CB4 ); - V[7] = _mm_set_epi64x( T1 ^ CB7, T1 ^ CB6 ); + V[0] = casti_v128( H, 0 ); + V[1] = casti_v128( H, 1 ); + V[2] = casti_v128( H, 2 ); + V[3] = casti_v128( H, 3 ); + V[4] = v128_set_64( CB1, CB0 ); + V[5] = v128_set_64( CB3, CB2 ); + V[6] = v128_set_64( T0 ^ CB5, T0 ^ CB4 ); + V[7] = v128_set_64( T1 ^ CB7, T1 ^ CB6 ); M0 = bswap_64( buf[ 0] ); M1 = bswap_64( buf[ 1] ); @@ -424,10 +424,10 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, BLAKE512_ROUND( 4 ); BLAKE512_ROUND( 5 ); - casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V[0], V[4] ); - casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V[1], V[5] ); - casti_m128i( H, 2 ) = mm128_xor3( casti_m128i( H, 2 ), V[2], V[6] ); - casti_m128i( H, 3 ) = mm128_xor3( casti_m128i( H, 3 ), V[3], V[7] ); + casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V[0], V[4] ) ); + casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V[1], V[5] ) ); + casti_v128( H, 2 ) = v128_xor( casti_v128( H, 2 ), v128_xor( V[2], V[6] ) ); + casti_v128( H, 3 ) = v128_xor( casti_v128( H, 3 ), v128_xor( V[3], V[7] ) ); } #endif @@ -611,7 +611,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data, VD = v512_64( T0 ^ CB5 ); \ VE = v512_64( T1 ^ CB6 ); \ VF = v512_64( T1 ^ CB7 ); \ - const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \ + const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64( \ 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \ M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \ M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \ @@ -679,7 +679,7 @@ void blake512_8way_compress( 
blake_8way_big_context *sc ) VE = v512_64( sc->T1 ^ CB6 ); VF = v512_64( sc->T1 ^ CB7 ); - const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( + const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 ); @@ -1347,7 +1347,7 @@ blake512_8way_close(void *cc, void *dst) VD = v256_64( T0 ^ CB5 ); \ VE = v256_64( T1 ^ CB6 ); \ VF = v256_64( T1 ^ CB7 ); \ - const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \ + const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64( \ 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \ M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \ M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \ @@ -1419,7 +1419,7 @@ void blake512_4way_compress( blake_4way_big_context *sc ) v256_64( CB6 ) ); VF = _mm256_xor_si256( v256_64( sc->T1 ), v256_64( CB7 ) ); - const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( + const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 ); diff --git a/algo/blake/blakecoin-4way.c b/algo/blake/blakecoin-4way.c index 9ff0199..6ebd75d 100644 --- a/algo/blake/blakecoin-4way.c +++ b/algo/blake/blakecoin-4way.c @@ -177,7 +177,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce, if ( opt_benchmark ) HTarget = 0x7f; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); blake256r8_4way_init( &blakecoin_4w_ctx ); blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 ); diff --git a/algo/blake/sph-blake2s.c b/algo/blake/sph-blake2s.c index 72cb415..1064d69 100644 --- a/algo/blake/sph-blake2s.c +++ b/algo/blake/sph-blake2s.c @@ -118,15 +118,15 @@ static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_ return 0; } -static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] ) +static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[8] ) { - memcpy( P->salt, salt, BLAKE2S_SALTBYTES ); + memcpy( P->salt, salt, 8 ); return 0; } -static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] ) +static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[8] ) { - memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES ); + memcpy( P->personal, personal, 8 ); return 0; } @@ -159,7 +159,7 @@ int blake2s_init( blake2s_state *S, const uint8_t outlen ) blake2s_param P[1]; /* Move interval verification here? 
*/ - if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; + if ( ( !outlen ) || ( outlen > 32 ) ) return -1; P->digest_length = outlen; P->key_length = 0; @@ -179,9 +179,9 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c { blake2s_param P[1]; - if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; + if ( ( !outlen ) || ( outlen > 32 ) ) return -1; - if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1; + if ( !key || !keylen || keylen > 8 ) return -1; P->digest_length = outlen; P->key_length = keylen; @@ -198,16 +198,16 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c if( blake2s_init_param( S, P ) < 0 ) return -1; { - uint8_t block[BLAKE2S_BLOCKBYTES]; - memset( block, 0, BLAKE2S_BLOCKBYTES ); + uint8_t block[64]; + memset( block, 0, 64 ); memcpy( block, key, keylen ); - blake2s_update( S, block, BLAKE2S_BLOCKBYTES ); - secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */ + blake2s_update( S, block, 64 ); + secure_zero_memory( block, 64 ); /* Burn the key from stack */ } return 0; } -int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ) +int blake2s_compress( blake2s_state *S, const uint8_t block[64] ) { uint32_t _ALIGN(32) m[16]; uint32_t _ALIGN(32) v[16]; @@ -329,16 +329,16 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ) while( inlen > 0 ) { size_t left = S->buflen; - size_t fill = 2 * BLAKE2S_BLOCKBYTES - left; + size_t fill = 2 * 64 - left; if( inlen > fill ) { memcpy( S->buf + left, in, fill ); // Fill buffer S->buflen += fill; - blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); + blake2s_increment_counter( S, 64 ); blake2s_compress( S, S->buf ); // Compress - memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left - S->buflen -= BLAKE2S_BLOCKBYTES; + memcpy( S->buf, S->buf + 64, 64 ); // Shift buffer left + S->buflen -= 64; in += fill; inlen -= fill; } @@ -356,19 +356,19 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ) int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ) { - uint8_t buffer[BLAKE2S_OUTBYTES]; + uint8_t buffer[32]; - if( S->buflen > BLAKE2S_BLOCKBYTES ) + if( S->buflen > 64 ) { - blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); + blake2s_increment_counter( S, 64 ); blake2s_compress( S, S->buf ); - S->buflen -= BLAKE2S_BLOCKBYTES; - memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen ); + S->buflen -= 64; + memcpy( S->buf, S->buf + 64, S->buflen ); } blake2s_increment_counter( S, ( uint32_t )S->buflen ); blake2s_set_lastblock( S ); - memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */ + memset( S->buf + S->buflen, 0, 2 * 64 - S->buflen ); /* Padding */ blake2s_compress( S, S->buf ); for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ @@ -408,10 +408,10 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen #include "blake2-kat.h" /* test data not included */ int main( int argc, char **argv ) { - uint8_t key[BLAKE2S_KEYBYTES]; + uint8_t key[8]; uint8_t buf[KAT_LENGTH]; - for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i ) + for( size_t i = 0; i < 8; ++i ) key[i] = ( uint8_t )i; for( size_t i = 0; i < KAT_LENGTH; ++i ) @@ -419,10 +419,10 @@ int main( int argc, char **argv ) for( size_t i = 0; i < KAT_LENGTH; ++i ) { - uint8_t hash[BLAKE2S_OUTBYTES]; - blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES ); + uint8_t hash[32]; + 
blake2s( hash, buf, key, 32, i, ); - if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) ) + if( 0 != memcmp( hash, blake2s_keyed_kat[i], 32 ) ) { puts( "error" ); return -1; diff --git a/algo/blake/sph-blake2s.h b/algo/blake/sph-blake2s.h index eb66b7a..e8aa93c 100644 --- a/algo/blake/sph-blake2s.h +++ b/algo/blake/sph-blake2s.h @@ -87,19 +87,6 @@ static inline void secure_zero_memory(void *v, size_t n) /* blake2.h */ -#if defined(__cplusplus) -extern "C" { -#endif - - enum blake2s_constant - { - BLAKE2S_BLOCKBYTES = 64, - BLAKE2S_OUTBYTES = 32, - BLAKE2S_KEYBYTES = 32, - BLAKE2S_SALTBYTES = 8, - BLAKE2S_PERSONALBYTES = 8 - }; - #pragma pack(push, 1) typedef struct __blake2s_param { @@ -112,8 +99,8 @@ extern "C" { uint8_t node_depth; // 15 uint8_t inner_length; // 16 // uint8_t reserved[0]; - uint8_t salt[BLAKE2S_SALTBYTES]; // 24 - uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 + uint8_t salt[8]; // 24 + uint8_t personal[8]; // 32 } blake2s_param; typedef struct ALIGN( 64 ) __blake2s_state @@ -121,13 +108,13 @@ extern "C" { uint32_t h[8]; uint32_t t[2]; uint32_t f[2]; - uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; + uint8_t buf[2 * 64]; size_t buflen; uint8_t last_node; } blake2s_state ; #pragma pack(pop) - int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ); + int blake2s_compress( blake2s_state *S, const uint8_t block[64] ); // Streaming API int blake2s_init( blake2s_state *S, const uint8_t outlen ); diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c index 19c7319..29f7677 100644 --- a/algo/blake/sph_blake2b.c +++ b/algo/blake/sph_blake2b.c @@ -95,6 +95,43 @@ } */ +#elif defined(__SSE2__) || defined(__NEON__) // ready for NEON + +#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \ +{ \ + Va = v128_add64( Va, v128_add64( Vb, \ + v128_set_64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \ + Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \ + Vc = v128_add64( Vc, Vd ); \ + Vb = v128_shuflr64_24( v128_xor( Vb, Vc ) ); \ +\ + Va = v128_add64( Va, v128_add64( Vb, \ + v128_set_64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \ + Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \ + Vc = v128_add64( Vc, Vd ); \ + Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \ +} + +#define BLAKE2B_ROUND( R ) \ +{ \ + __m128i *V = (__m128i*)v; \ + __m128i V2, V3, V6, V7; \ + const uint8_t *sigmaR = sigma[R]; \ + BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \ + BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \ + V2 = v128_alignr64( V[3], V[2], 1 ); \ + V3 = v128_alignr64( V[2], V[3], 1 ); \ + V6 = v128_alignr64( V[6], V[7], 1 ); \ + V7 = v128_alignr64( V[7], V[6], 1 ); \ + BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \ + BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \ + V[2] = v128_alignr64( V2, V3, 1 ); \ + V[3] = v128_alignr64( V3, V2, 1 ); \ + V[6] = v128_alignr64( V7, V6, 1 ); \ + V[7] = v128_alignr64( V6, V7, 1 ); \ +} + +/* #elif defined(__SSE2__) // always true @@ -131,6 +168,7 @@ V[6] = mm128_alignr_64( V7, V6, 1 ); \ V[7] = mm128_alignr_64( V6, V7, 1 ); \ } +*/ #else // never used, SSE2 is always available diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 7f62099..ddcaef4 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -1,13 +1,6 @@ /* CubeHash 16/32 is recommended for SHA-3 "normal", 16/1 for "formal" */ #define CUBEHASH_ROUNDS 16 #define CUBEHASH_BLOCKBYTES 32 -#define OPTIMIZE_SSE2 -#if defined(OPTIMIZE_SSE2) -#include -#endif -#ifdef __AVX2__ -#include -#endif #include "cubehash_sse2.h" 
#include #include @@ -80,70 +73,73 @@ static void transform( cubehashParam *sp ) _mm256_store_si256( (__m256i*)sp->x + 2, x2 ); _mm256_store_si256( (__m256i*)sp->x + 3, x3 ); -#else - __m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3; +#else // AVX, SSE2, NEON - x0 = _mm_load_si128( (__m128i*)sp->x ); - x1 = _mm_load_si128( (__m128i*)sp->x + 1 ); - x2 = _mm_load_si128( (__m128i*)sp->x + 2 ); - x3 = _mm_load_si128( (__m128i*)sp->x + 3 ); - x4 = _mm_load_si128( (__m128i*)sp->x + 4 ); - x5 = _mm_load_si128( (__m128i*)sp->x + 5 ); - x6 = _mm_load_si128( (__m128i*)sp->x + 6 ); - x7 = _mm_load_si128( (__m128i*)sp->x + 7 ); +#pragma message "NEON for Cubehash" + + v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3; + + x0 = casti_v128( sp->x, 0 ); + x1 = casti_v128( sp->x, 1 ); + x2 = casti_v128( sp->x, 2 ); + x3 = casti_v128( sp->x, 3 ); + x4 = casti_v128( sp->x, 4 ); + x5 = casti_v128( sp->x, 5 ); + x6 = casti_v128( sp->x, 6 ); + x7 = casti_v128( sp->x, 7 ); for ( r = 0; r < rounds; ++r ) { - x4 = _mm_add_epi32( x0, x4 ); - x5 = _mm_add_epi32( x1, x5 ); - x6 = _mm_add_epi32( x2, x6 ); - x7 = _mm_add_epi32( x3, x7 ); + x4 = v128_add32( x0, x4 ); + x5 = v128_add32( x1, x5 ); + x6 = v128_add32( x2, x6 ); + x7 = v128_add32( x3, x7 ); y0 = x2; y1 = x3; y2 = x0; y3 = x1; - x0 = mm128_rol_32( y0, 7 ); - x1 = mm128_rol_32( y1, 7 ); - x2 = mm128_rol_32( y2, 7 ); - x3 = mm128_rol_32( y3, 7 ); - x0 = _mm_xor_si128( x0, x4 ); - x1 = _mm_xor_si128( x1, x5 ); - x2 = _mm_xor_si128( x2, x6 ); - x3 = _mm_xor_si128( x3, x7 ); - x4 = _mm_shuffle_epi32( x4, 0x4e ); - x5 = _mm_shuffle_epi32( x5, 0x4e ); - x6 = _mm_shuffle_epi32( x6, 0x4e ); - x7 = _mm_shuffle_epi32( x7, 0x4e ); - x4 = _mm_add_epi32( x0, x4 ); - x5 = _mm_add_epi32( x1, x5 ); - x6 = _mm_add_epi32( x2, x6 ); - x7 = _mm_add_epi32( x3, x7 ); + x0 = v128_rol32( y0, 7 ); + x1 = v128_rol32( y1, 7 ); + x2 = v128_rol32( y2, 7 ); + x3 = v128_rol32( y3, 7 ); + x0 = v128_xor( x0, x4 ); + x1 = v128_xor( x1, x5 ); + x2 = v128_xor( x2, x6 ); + x3 = v128_xor( x3, x7 ); + x4 = v128_swap64( x4 ); + x5 = v128_swap64( x5 ); + x6 = v128_swap64( x6 ); + x7 = v128_swap64( x7 ); + x4 = v128_add32( x0, x4 ); + x5 = v128_add32( x1, x5 ); + x6 = v128_add32( x2, x6 ); + x7 = v128_add32( x3, x7 ); y0 = x1; y1 = x0; y2 = x3; y3 = x2; - x0 = mm128_rol_32( y0, 11 ); - x1 = mm128_rol_32( y1, 11 ); - x2 = mm128_rol_32( y2, 11 ); - x3 = mm128_rol_32( y3, 11 ); - x0 = _mm_xor_si128( x0, x4 ); - x1 = _mm_xor_si128( x1, x5 ); - x2 = _mm_xor_si128( x2, x6 ); - x3 = _mm_xor_si128( x3, x7 ); - x4 = _mm_shuffle_epi32( x4, 0xb1 ); - x5 = _mm_shuffle_epi32( x5, 0xb1 ); - x6 = _mm_shuffle_epi32( x6, 0xb1 ); - x7 = _mm_shuffle_epi32( x7, 0xb1 ); + x0 = v128_rol32( y0, 11 ); + x1 = v128_rol32( y1, 11 ); + x2 = v128_rol32( y2, 11 ); + x3 = v128_rol32( y3, 11 ); + x0 = v128_xor( x0, x4 ); + x1 = v128_xor( x1, x5 ); + x2 = v128_xor( x2, x6 ); + x3 = v128_xor( x3, x7 ); + x4 = v128_swap64_32( x4 ); + x5 = v128_swap64_32( x5 ); + x6 = v128_swap64_32( x6 ); + x7 = v128_swap64_32( x7 ); } - _mm_store_si128( (__m128i*)sp->x, x0 ); - _mm_store_si128( (__m128i*)sp->x + 1, x1 ); - _mm_store_si128( (__m128i*)sp->x + 2, x2 ); - _mm_store_si128( (__m128i*)sp->x + 3, x3 ); - _mm_store_si128( (__m128i*)sp->x + 4, x4 ); - _mm_store_si128( (__m128i*)sp->x + 5, x5 ); - _mm_store_si128( (__m128i*)sp->x + 6, x6 ); - _mm_store_si128( (__m128i*)sp->x + 7, x7 ); + casti_v128( sp->x, 0 ) = x0; + casti_v128( sp->x, 1 ) = x1; + casti_v128( sp->x, 2 ) = x2; + casti_v128( sp->x, 3 ) = x3; + casti_v128( sp->x, 4 ) = 
x4; + casti_v128( sp->x, 5 ) = x5; + casti_v128( sp->x, 6 ) = x6; + casti_v128( sp->x, 7 ) = x7; #endif } // transform @@ -170,7 +166,7 @@ static const uint64_t IV512[] = int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes) { - __m128i *x = (__m128i*)sp->x; + v128_t *x = (v128_t*)sp->x; sp->hashlen = hashbitlen/128; sp->blocksize = blockbytes/16; sp->rounds = rounds; @@ -179,34 +175,34 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes) if ( hashbitlen == 512 ) { - x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 ); - x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); - x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); - x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); - x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); - x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); - x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); - x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); + x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 ); + x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); + x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); + x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); + x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); + x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); + x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); + x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); } else { - x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 ); - x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B ); - x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 ); - x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 ); - x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 ); - x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 ); - x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE ); - x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB ); + x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 ); + x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B ); + x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 ); + x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 ); + x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 ); + x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 ); + x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE ); + x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB ); } - return SUCCESS; + return 0; } -int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size ) +int cubehashUpdate( cubehashParam *sp, const void *data, size_t size ) { const int len = size / 16; - const __m128i* in = (__m128i*)data; + const v128_t* in = (v128_t*)data; int i; // It is assumed data is aligned to 256 bits and is a multiple of 128 bits. 
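
The 0x4e and 0xb1 shuffle immediates removed above swap, respectively, the two 64-bit halves of a vector and the two 32-bit lanes inside each 64-bit half. A minimal sketch of how the portable v128_swap64 / v128_swap64_32 wrappers could map onto both targets follows; this is illustrative only, the real definitions live in simd-utils and may differ.

#if defined(__SSE2__)
  #define v128_swap64( v )     _mm_shuffle_epi32( v, 0x4e )  /* swap the two 64-bit halves */
  #define v128_swap64_32( v )  _mm_shuffle_epi32( v, 0xb1 )  /* swap 32-bit lanes within each half */
#elif defined(__ARM_NEON)
  #define v128_swap64( v )     vextq_u32( v, v, 2 )          /* rotate vector by two 32-bit lanes */
  #define v128_swap64_32( v )  vrev64q_u32( v )              /* reverse 32-bit lanes per 64-bit half */
#endif
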
@@ -214,7 +210,7 @@ int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size ) for ( i = 0; i < len; i++ ) { - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] ); sp->pos++; if ( sp->pos == sp->blocksize ) { @@ -223,20 +219,20 @@ int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size ) } } - return SUCCESS; + return 0; } -int cubehashDigest( cubehashParam *sp, byte *digest ) +int cubehashDigest( cubehashParam *sp, void *digest ) { - __m128i* hash = (__m128i*)digest; + v128_t* hash = (v128_t*)digest; int i; // pos is zero for 64 byte data, 1 for 80 byte data. - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], - _mm_set_epi64x( 0, 0x80 ) ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], + v128_set_64( 0, 0x80 ) ); transform( sp ); - sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) ); + sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) ); transform( sp ); transform( sp ); transform( sp ); @@ -251,15 +247,15 @@ int cubehashDigest( cubehashParam *sp, byte *digest ) for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->x[i]; - return SUCCESS; + return 0; } -int cubehashUpdateDigest( cubehashParam *sp, byte *digest, - const byte *data, size_t size ) +int cubehashUpdateDigest( cubehashParam *sp, void *digest, + const void *data, size_t size ) { const int len = size / 16; - const __m128i* in = (__m128i*)data; - __m128i* hash = (__m128i*)digest; + const v128_t* in = (v128_t*)data; + v128_t* hash = (v128_t*)digest; int i; // It is assumed data is aligned to 256 bits and is a multiple of 128 bits. @@ -267,7 +263,7 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest, for ( i = 0; i < len; i++ ) { - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] ); sp->pos++; if ( sp->pos == sp->blocksize ) { @@ -277,11 +273,11 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest, } // pos is zero for 64 byte data, 1 for 80 byte data. 
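
With byte*/SUCCESS replaced by void*/0, a caller needs nothing beyond standard types. A minimal usage sketch of the converted streaming interface, assuming a 64-byte, 32-byte-aligned input; the function and buffer names here are hypothetical and for illustration only.

#include <stdint.h>
#include "cubehash_sse2.h"

void example_cubehash512( void )
{
   cubehashParam ctx;
   uint8_t msg[64]  __attribute__ ((aligned (32))) = {0};  /* length must be a multiple of 16 bytes */
   uint8_t hash[64] __attribute__ ((aligned (32)));

   cubehashInit( &ctx, 512, 16, 32 );        /* 512-bit hash, 16 rounds, 32-byte blocks */
   cubehashUpdate( &ctx, msg, sizeof msg );  /* returns 0 on success */
   cubehashDigest( &ctx, hash );
}
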
- sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], - _mm_set_epi64x( 0, 0x80 ) ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], + v128_set_64( 0, 0x80 ) ); transform( sp ); - sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) ); + sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) ); transform( sp ); transform( sp ); @@ -297,13 +293,13 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest, for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->x[i]; - return SUCCESS; + return 0; } -int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen, - const byte *data, size_t size ) +int cubehash_full( cubehashParam *sp, void *digest, int hashbitlen, + const void *data, size_t size ) { - __m128i *x = (__m128i*)sp->x; + v128_t *x = (v128_t*)sp->x; sp->hashlen = hashbitlen/128; sp->blocksize = 32/16; sp->rounds = 16; @@ -312,33 +308,33 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen, if ( hashbitlen == 512 ) { - x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 ); - x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); - x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); - x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); - x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); - x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); - x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); - x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); + x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 ); + x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); + x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); + x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); + x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); + x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); + x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); + x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); } else { - x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 ); - x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B ); - x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 ); - x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 ); - x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 ); - x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 ); - x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE ); - x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB ); + x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 ); + x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B ); + x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 ); + x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 ); + x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 ); + x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 ); + x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE ); + x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB ); } const int len = size / 16; - const __m128i* in = (__m128i*)data; - __m128i* hash = (__m128i*)digest; + const v128_t* in = (v128_t*)data; + v128_t* hash = (v128_t*)digest; int i; // It is assumed data is aligned to 256 bits and is a multiple of 128 bits. 
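
The casti_v128/cast_v128 accessors and v128_set_64 used throughout this conversion are thin pointer-cast and lane-set wrappers. One plausible way they could be expressed on each target is sketched below; the actual definitions belong to simd-utils (simd-128.h, simd-neon.h) and may differ.

#if defined(__SSE2__)
  typedef __m128i v128_t;
  #define v128_set_64( hi, lo )  _mm_set_epi64x( hi, lo )
#elif defined(__ARM_NEON)
  typedef uint32x4_t v128_t;     /* assumed element type; reinterpreted as needed */
  #define v128_set_64( hi, lo ) \
     vreinterpretq_u32_u64( vcombine_u64( vcreate_u64( lo ), vcreate_u64( hi ) ) )
#endif
/* index an arbitrary buffer as an array of 128-bit vectors */
#define cast_v128( p )      (*(v128_t*)(p))
#define casti_v128( p, i )  (((v128_t*)(p))[(i)])
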
@@ -346,7 +342,7 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen, for ( i = 0; i < len; i++ ) { - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] ); sp->pos++; if ( sp->pos == sp->blocksize ) { @@ -356,11 +352,11 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen, } // pos is zero for 64 byte data, 1 for 80 byte data. - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], - _mm_set_epi64x( 0, 0x80 ) ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], + v128_set_64( 0, 0x80 ) ); transform( sp ); - sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) ); + sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) ); transform( sp ); transform( sp ); @@ -376,6 +372,6 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen, for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->x[i]; - return SUCCESS; + return 0; } diff --git a/algo/cubehash/cubehash_sse2.h b/algo/cubehash/cubehash_sse2.h index 5b69ac7..670d0b2 100644 --- a/algo/cubehash/cubehash_sse2.h +++ b/algo/cubehash/cubehash_sse2.h @@ -3,11 +3,7 @@ #include "compat.h" #include -#include "compat/sha3-defs.h" - -#define OPTIMIZE_SSE2 - -#include +#include "simd-utils.h" /*!\brief Holds all the parameters necessary for the CUBEHASH algorithm. * \ingroup HASH_cubehash_m @@ -15,7 +11,7 @@ struct _cubehashParam { - __m128i _ALIGN(64) x[8]; // aligned for __m512i + v128_t _ALIGN(64) x[8]; // aligned for __m512i int hashlen; // __m128i int rounds; int blocksize; // __m128i @@ -32,15 +28,15 @@ int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes); // reinitialize context with same parameters, much faster. int cubehashReinit( cubehashParam* sp ); -int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size); +int cubehashUpdate(cubehashParam* sp, const void *data, size_t size); -int cubehashDigest(cubehashParam* sp, byte *digest); +int cubehashDigest(cubehashParam* sp, void *digest); -int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data, - size_t size ); +int cubehashUpdateDigest( cubehashParam *sp, void *digest, + const void *data, size_t size ); -int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen, - const byte *data, size_t size ); +int cubehash_full( cubehashParam* sp, void *digest, int hashbitlen, + const void *data, size_t size ); #ifdef __cplusplus } diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index 605508f..2a0c5a5 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -13,6 +13,9 @@ * Institute of Applied Mathematics, Middle East Technical University, Turkey. 
* */ + +//TODO NEON support, funky shuffles + #if defined(__AES__) #include diff --git a/algo/echo/aes_ni/hash_api.h b/algo/echo/aes_ni/hash_api.h index 816d457..b961fe6 100644 --- a/algo/echo/aes_ni/hash_api.h +++ b/algo/echo/aes_ni/hash_api.h @@ -24,16 +24,16 @@ #include "compat/sha3_common.h" -#include +#include "simd-utils.h" typedef struct { - __m128i state[4][4]; + v128_t state[4][4]; BitSequence buffer[192]; - __m128i k; - __m128i hashsize; - __m128i const1536; + v128_t k; + v128_t hashsize; + v128_t const1536; unsigned int uRounds; unsigned int uHashSize; diff --git a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index 558215a..bd8d8d0 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -9,13 +9,12 @@ #ifndef __hash_h #define __hash_h -#include - #include #if defined(_WIN64) || defined(__WINDOWS__) #include #endif #include +#include "simd-utils.h" #define LENGTH (512) @@ -67,8 +66,8 @@ typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr #define SIZE512 (SIZE_1024/16) typedef struct { - __attribute__ ((aligned (64))) __m128i chaining[SIZE512]; - __attribute__ ((aligned (64))) __m128i buffer[SIZE512]; + __attribute__ ((aligned (64))) v128_t chaining[SIZE512]; + __attribute__ ((aligned (64))) v128_t buffer[SIZE512]; int hashlen; // byte int blk_count; // SIZE_m128i int buf_ptr; // __m128i offset diff --git a/algo/groestl/aes_ni/hash-groestl256.h b/algo/groestl/aes_ni/hash-groestl256.h index 24544a5..7d77805 100644 --- a/algo/groestl/aes_ni/hash-groestl256.h +++ b/algo/groestl/aes_ni/hash-groestl256.h @@ -9,7 +9,7 @@ #ifndef __hash_h #define __hash_h -#include +#include "simd-utils.h" #include #if defined(_WIN64) || defined(__WINDOWS__) #include @@ -91,8 +91,8 @@ typedef enum #define SIZE256 (SIZE_512/16) typedef struct { - __attribute__ ((aligned (32))) __m128i chaining[SIZE256]; - __attribute__ ((aligned (32))) __m128i buffer[SIZE256]; + __attribute__ ((aligned (32))) v128_t chaining[SIZE256]; + __attribute__ ((aligned (32))) v128_t buffer[SIZE256]; int hashlen; // bytes int blk_count; int buf_ptr; /* data buffer pointer */ diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h index 05ddccb..1439ef1 100644 --- a/algo/groestl/groestl256-hash-4way.h +++ b/algo/groestl/groestl256-hash-4way.h @@ -10,7 +10,6 @@ #define GROESTL256_HASH_4WAY_H__ 1 #include "simd-utils.h" -#include #include #include #if defined(_WIN64) || defined(__WINDOWS__) diff --git a/algo/groestl/groestl512-hash-4way.h b/algo/groestl/groestl512-hash-4way.h index 7025428..9cd3e82 100644 --- a/algo/groestl/groestl512-hash-4way.h +++ b/algo/groestl/groestl512-hash-4way.h @@ -2,7 +2,6 @@ #define GROESTL512_HASH_4WAY_H__ 1 #include "simd-utils.h" -#include #include #include #if defined(_WIN64) || defined(__WINDOWS__) diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index 0b13ad2..856a7fc 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -211,7 +211,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce, if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); do { *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c index 9e94423..ece87ac 100644 --- a/algo/haval/haval-4way-helper.c +++ b/algo/haval/haval-4way-helper.c @@ -41,7 +41,7 @@ static void 
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update) ( haval_4way_context *sc, const void *data, size_t len ) { - __m128i *vdata = (__m128i*)data; + v128_t *vdata = (v128_t*)data; unsigned current; current = (unsigned)sc->count_low & 127U; @@ -53,7 +53,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update) clen = 128U - current; if ( clen > len ) clen = len; - memcpy_128( sc->buf + (current>>2), vdata, clen>>2 ); + v128_memcpy( sc->buf + (current>>2), vdata, clen>>2 ); vdata += clen>>2; current += clen; len -= clen; @@ -88,7 +88,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc, RSTATE; if ( current > 116UL ) { - memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 ); + v128_memset_zero( sc->buf + ( current>>2 ), (128UL-current) >> 2 ); do { IN_PREPARE(sc->buf); @@ -98,12 +98,12 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc, } uint32_t t1, t2; - memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 ); + v128_memset_zero( sc->buf + ( current>>2 ), (116UL-current) >> 2 ); t1 = 0x01 | (PASSES << 3); t2 = sc->olen << 3; - sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) ); - sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 ); - sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3) + sc->buf[ 116>>2 ] = v128_32( ( t1 << 16 ) | ( t2 << 24 ) ); + sc->buf[ 120>>2 ] = v128_32( sc->count_low << 3 ); + sc->buf[ 124>>2 ] = v128_32( (sc->count_high << 3) | (sc->count_low >> 29) ); do { diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index b5abd63..bf1fca3 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -38,11 +38,12 @@ #include #include +#include #include "haval-hash-4way.h" // won't compile with sse4.2, not a problem, it's only used with AVX2 4 way. 
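
The F1..F5 rewrites below lean on a few three-operand helpers whose meaning can be read directly from the SSE2 expressions they replace (and from the v128_andnotxor fallback shown). A generic sketch for reference only; simd-utils may instead collapse each into a single _mm_ternarylogic_epi32 when AVX512VL is available.

#define v128_xor3( a, b, c )    v128_xor( a, v128_xor( b, c ) )   /* a ^ b ^ c     */
#define v128_and3( a, b, c )    v128_and( a, v128_and( b, c ) )   /* a & b & c     */
#define v128_andxor( a, b, c )  v128_and( a, v128_xor( b, c ) )   /* a & ( b ^ c ) */
#define v128_xorand( a, b, c )  v128_xor( a, v128_and( b, c ) )   /* a ^ ( b & c ) */
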
//#if defined (__SSE4_2__) -#if defined(__AVX__) +#if defined(__AVX__) || defined(__ARM_NEON) #ifdef __cplusplus extern "C"{ @@ -55,97 +56,97 @@ extern "C"{ #if defined(__AVX512VL__) // ( ~( a ^ b ) ) & c -#define mm128_andnotxor( a, b, c ) \ +#define v128_andnotxor( a, b, c ) \ _mm_ternarylogic_epi32( a, b, c, 0x82 ) #else -#define mm128_andnotxor( a, b, c ) \ - _mm_andnot_si128( _mm_xor_si128( a, b ), c ) +#define v128_andnotxor( a, b, c ) \ + v128_andnot( v128_xor( a, b ), c ) #endif #define F1(x6, x5, x4, x3, x2, x1, x0) \ - mm128_xor3( x0, mm128_andxor( x1, x0, x4 ), \ - _mm_xor_si128( _mm_and_si128( x2, x5 ), \ - _mm_and_si128( x3, x6 ) ) ) \ + v128_xor3( x0, v128_andxor( x1, x0, x4 ), \ + v128_xor( v128_and( x2, x5 ), \ + v128_and( x3, x6 ) ) ) \ #define F2(x6, x5, x4, x3, x2, x1, x0) \ - mm128_xor3( mm128_andxor( x2, _mm_andnot_si128( x3, x1 ), \ - mm128_xor3( _mm_and_si128( x4, x5 ), x6, x0 ) ), \ - mm128_andxor( x4, x1, x5 ), \ - mm128_xorand( x0, x3, x5 ) ) \ + v128_xor3( v128_andxor( x2, v128_andnot( x3, x1 ), \ + v128_xor3( v128_and( x4, x5 ), x6, x0 ) ), \ + v128_andxor( x4, x1, x5 ), \ + v128_xorand( x0, x3, x5 ) ) \ #define F3(x6, x5, x4, x3, x2, x1, x0) \ - mm128_xor3( x0, \ - _mm_and_si128( x3, \ - mm128_xor3( _mm_and_si128( x1, x2 ), x6, x0 ) ), \ - _mm_xor_si128( _mm_and_si128( x1, x4 ), \ - _mm_and_si128( x2, x5 ) ) ) + v128_xor3( x0, \ + v128_and( x3, \ + v128_xor3( v128_and( x1, x2 ), x6, x0 ) ), \ + v128_xor( v128_and( x1, x4 ), \ + v128_and( x2, x5 ) ) ) #define F4(x6, x5, x4, x3, x2, x1, x0) \ - mm128_xor3( \ - mm128_andxor( x3, x5, \ - _mm_xor_si128( _mm_and_si128( x1, x2 ), \ - _mm_or_si128( x4, x6 ) ) ), \ - _mm_and_si128( x4, \ - mm128_xor3( x0, _mm_andnot_si128( x2, x5 ), \ - _mm_xor_si128( x1, x6 ) ) ), \ - mm128_xorand( x0, x2, x6 ) ) + v128_xor3( \ + v128_andxor( x3, x5, \ + v128_xor( v128_and( x1, x2 ), \ + v128_or( x4, x6 ) ) ), \ + v128_and( x4, \ + v128_xor3( x0, v128_andnot( x2, x5 ), \ + v128_xor( x1, x6 ) ) ), \ + v128_xorand( x0, x2, x6 ) ) #define F5(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - mm128_andnotxor( mm128_and3( x1, x2, x3 ), x5, x0 ), \ - mm128_xor3( _mm_and_si128( x1, x4 ), \ - _mm_and_si128( x2, x5 ), \ - _mm_and_si128( x3, x6 ) ) ) + v128_xor( \ + v128_andnotxor( v128_and3( x1, x2, x3 ), x5, x0 ), \ + v128_xor3( v128_and( x1, x4 ), \ + v128_and( x2, x5 ), \ + v128_and( x3, x6 ) ) ) /* #define F1(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( x0, \ - _mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \ - _mm_xor_si128( _mm_and_si128( x2, x5 ), \ - _mm_and_si128( x3, x6 ) ) ) ) \ + v128_xor( x0, \ + v128_xor( v128_and(v128_xor( x0, x4 ), x1 ), \ + v128_xor( v128_and( x2, x5 ), \ + v128_and( x3, x6 ) ) ) ) \ #define F2(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_and_si128( x2, \ - _mm_xor_si128( _mm_andnot_si128( x3, x1 ), \ - _mm_xor_si128( _mm_and_si128( x4, x5 ), \ - _mm_xor_si128( x6, x0 ) ) ) ), \ - _mm_xor_si128( \ - _mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \ - _mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) ) \ + v128_xor( \ + v128_and( x2, \ + v128_xor( v128_andnot( x3, x1 ), \ + v128_xor( v128_and( x4, x5 ), \ + v128_xor( x6, x0 ) ) ) ), \ + v128_xor( \ + v128_and( x4, v128_xor( x1, x5 ) ), \ + v128_xor( v128_and( x3, x5 ), x0 ) ) ) \ #define F3(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_and_si128( x3, \ - _mm_xor_si128( _mm_and_si128( x1, x2 ), \ - _mm_xor_si128( x6, x0 ) ) ), \ - _mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \ - _mm_and_si128( x2, x5 ) ), x0 ) ) + v128_xor( \ 
+ v128_and( x3, \ + v128_xor( v128_and( x1, x2 ), \ + v128_xor( x6, x0 ) ) ), \ + v128_xor( v128_xor(v128_and( x1, x4 ), \ + v128_and( x2, x5 ) ), x0 ) ) #define F4(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_xor_si128( \ - _mm_and_si128( x3, \ - _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \ - _mm_or_si128( x4, x6 ) ), x5 ) ), \ - _mm_and_si128( x4, \ - _mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm128_not(x2), x5 ), \ - _mm_xor_si128( x1, x6 ) ), x0 ) ) ), \ - _mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) ) + v128_xor( \ + v128_xor( \ + v128_and( x3, \ + v128_xor( v128_xor( v128_and( x1, x2 ), \ + v128_or( x4, x6 ) ), x5 ) ), \ + v128_and( x4, \ + v128_xor( v128_xor( v128_and( v128_not(x2), x5 ), \ + v128_xor( x1, x6 ) ), x0 ) ) ), \ + v128_xor( v128_and( x2, x6 ), x0 ) ) #define F5(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_and_si128( x0, \ - mm128_not( _mm_xor_si128( \ - _mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \ - _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \ - _mm_and_si128( x2, x5 ) ), \ - _mm_and_si128( x3, x6 ) ) ) + v128_xor( \ + v128_and( x0, \ + v128_not( v128_xor( \ + v128_and( v128_and( x1, x2 ), x3 ), x5 ) ) ), \ + v128_xor( v128_xor( v128_and( x1, x4 ), \ + v128_and( x2, x5 ) ), \ + v128_and( x3, x6 ) ) ) */ /* @@ -186,17 +187,17 @@ extern "C"{ */ #define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \ do { \ - __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ - x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \ - mm128_ror_32( x7, 11 ) ), \ - _mm_add_epi32( w, v128_32( c ) ) ); \ + v128_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ + x7 = v128_add32( v128_add32( v128_ror32( t, 7 ), \ + v128_ror32( x7, 11 ) ), \ + v128_add32( w, v128_32( c ) ) ); \ } while (0) #define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \ do { \ - __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ - x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \ - mm128_ror_32( x7, 11 ) ), w ); \ + v128_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ + x7 = v128_add32( v128_add32( v128_ror32( t, 7 ), \ + v128_ror32( x7, 11 ) ), w ); \ } while (0) /* @@ -371,7 +372,7 @@ static const uint32_t RK5[32] = { }; #define SAVE_STATE \ - __m128i u0, u1, u2, u3, u4, u5, u6, u7; \ + v128_t u0, u1, u2, u3, u4, u5, u6, u7; \ do { \ u0 = s0; \ u1 = s1; \ @@ -385,14 +386,14 @@ static const uint32_t RK5[32] = { #define UPDATE_STATE \ do { \ - s0 = _mm_add_epi32( s0, u0 ); \ - s1 = _mm_add_epi32( s1, u1 ); \ - s2 = _mm_add_epi32( s2, u2 ); \ - s3 = _mm_add_epi32( s3, u3 ); \ - s4 = _mm_add_epi32( s4, u4 ); \ - s5 = _mm_add_epi32( s5, u5 ); \ - s6 = _mm_add_epi32( s6, u6 ); \ - s7 = _mm_add_epi32( s7, u7 ); \ + s0 = v128_add32( s0, u0 ); \ + s1 = v128_add32( s1, u1 ); \ + s2 = v128_add32( s2, u2 ); \ + s3 = v128_add32( s3, u3 ); \ + s4 = v128_add32( s4, u4 ); \ + s5 = v128_add32( s5, u5 ); \ + s6 = v128_add32( s6, u6 ); \ + s7 = v128_add32( s7, u7 ); \ } while (0) /* @@ -431,7 +432,7 @@ do { \ /* * DSTATE declares the state variables "s0" to "s7". */ -#define DSTATE __m128i s0, s1, s2, s3, s4, s5, s6, s7 +#define DSTATE v128_t s0, s1, s2, s3, s4, s5, s6, s7 /* * RSTATE fills the state variables from the context "sc". 
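
STEP and STEP1 above now go through v128_ror32. A hedged sketch of how a portable 32-bit vector rotate could be provided on both targets; illustrative only, simd-utils may use _mm_ror_epi32 directly when AVX512VL is available.

#if defined(__SSE2__)
  #define v128_ror32( v, c ) \
     _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
  #define v128_rol32( v, c ) \
     _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#elif defined(__ARM_NEON)
  /* shift-and-insert yields a two-instruction rotate */
  #define v128_ror32( v, c )  vsriq_n_u32( vshlq_n_u32( v, 32-(c) ), v, c )
  #define v128_rol32( v, c )  vsliq_n_u32( vshrq_n_u32( v, 32-(c) ), v, c )
#endif
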
@@ -486,7 +487,7 @@ haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes ) } -#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata) +#define IN_PREPARE(indata) const v128_t *const load_ptr = (indata) #define INW(i) load_ptr[ i ] @@ -497,7 +498,7 @@ haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes ) static void haval_4way_out( haval_4way_context *sc, void *dst ) { - __m128i *buf = (__m128i*)dst; + v128_t *buf = (v128_t*)dst; DSTATE; RSTATE; diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index 271f2a8..db14188 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -61,7 +61,7 @@ #ifndef HAVAL_HASH_4WAY_H__ #define HAVAL_HASH_4WAY_H__ 1 -#if defined(__AVX__) +#if defined(__AVX__) || defined(__ARM_NEON) #ifdef __cplusplus extern "C"{ @@ -73,8 +73,8 @@ extern "C"{ #define SPH_SIZE_haval256_5 256 typedef struct { - __m128i buf[32]; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; + v128_t buf[32]; + v128_t s0, s1, s2, s3, s4, s5, s6, s7; unsigned olen, passes; uint32_t count_high, count_low; } haval_4way_context; diff --git a/algo/hodl/aes.c b/algo/hodl/aes.c index 5be2af3..380adfd 100644 --- a/algo/hodl/aes.c +++ b/algo/hodl/aes.c @@ -1,10 +1,11 @@ #include -#include -#include "wolf-aes.h" #include "miner.h" #if defined(__AES__) +#include +#include "wolf-aes.h" + static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2) { __m128i tmp4; diff --git a/algo/hodl/hodl-gate.c b/algo/hodl/hodl-gate.c index ca94fd3..fa49afd 100644 --- a/algo/hodl/hodl-gate.c +++ b/algo/hodl/hodl-gate.c @@ -1,4 +1,5 @@ #include +#include #include #include "hodl-gate.h" diff --git a/algo/hodl/hodl-wolf.c b/algo/hodl/hodl-wolf.c index 7ce79da..ea3c777 100644 --- a/algo/hodl/hodl-wolf.c +++ b/algo/hodl/hodl-wolf.c @@ -1,7 +1,7 @@ #include #include #include -#include +#include "simd-utils.h" #include "sha512-avx.h" #include "wolf-aes.h" #include "hodl-gate.h" diff --git a/algo/hodl/hodl-wolf.h b/algo/hodl/hodl-wolf.h index 47c8fb8..679d359 100644 --- a/algo/hodl/hodl-wolf.h +++ b/algo/hodl/hodl-wolf.h @@ -2,7 +2,7 @@ #define __HODL_H #include -#include +#include "simd-utils.h" #include "miner.h" #define AES_ITERATIONS 15 @@ -16,7 +16,7 @@ typedef union _CacheEntry { uint32_t dwords[GARBAGE_SLICE_SIZE >> 2] __attribute__((aligned(16))); - __m128i dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16))); + v128_t dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16))); } CacheEntry; int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce, diff --git a/algo/hodl/sha512-avx.h b/algo/hodl/sha512-avx.h index 6fbb5bf..bbc8b3b 100644 --- a/algo/hodl/sha512-avx.h +++ b/algo/hodl/sha512-avx.h @@ -2,7 +2,7 @@ #define _SHA512_H #include -#include "emmintrin.h" +#include "simd-utils.h" //SHA-512 block size #define SHA512_BLOCK_SIZE 128 @@ -24,8 +24,8 @@ typedef struct __m256i w[80]; #elif defined(__SSE4_2__) //#elif defined(__AVX__) - __m128i h[8]; - __m128i w[80]; + v128_t h[8]; + v128_t w[80]; #else int dummy; #endif diff --git a/algo/hodl/wolf-aes.h b/algo/hodl/wolf-aes.h index b33407f..7aa6364 100644 --- a/algo/hodl/wolf-aes.h +++ b/algo/hodl/wolf-aes.h @@ -2,9 +2,9 @@ #define __WOLF_AES_H #include -#include +#include "simd-utils.h" -void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf); +void ExpandAESKey256(v128_t *keys, const v128_t *KeyBuf); #if defined(__SSE4_2__) //#ifdef __AVX__ @@ -12,13 +12,13 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf); #define AES_PARALLEL_N 8 #define 
BLOCK_COUNT 256 -void AES256CBC( __m128i** data, const __m128i** next, __m128i ExpandedKey[][16], - __m128i* IV ); +void AES256CBC( v128_t** data, const v128_t** next, v128_t ExpandedKey[][16], + v128_t* IV ); #else -void AES256CBC( __m128i *Ciphertext, const __m128i *Plaintext, - const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount ); +void AES256CBC( v128_t *Ciphertext, const v128_t *Plaintext, + const v128_t *ExpandedKey, v128_t IV, uint32_t BlockCount ); #endif diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index 95f437e..6bc1b2c 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -8,10 +8,10 @@ void keccakhash_8way(void *state, const void *input) { - keccak256_8way_context ctx; - keccak256_8way_init( &ctx ); - keccak256_8way_update( &ctx, input, 80 ); - keccak256_8way_close( &ctx, state ); + keccak256_8x64_context ctx; + keccak256_8x64_init( &ctx ); + keccak256_8x64_update( &ctx, input, 80 ); + keccak256_8x64_close( &ctx, state ); } int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, @@ -61,10 +61,10 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, void keccakhash_4way(void *state, const void *input) { - keccak256_4way_context ctx; - keccak256_4way_init( &ctx ); - keccak256_4way_update( &ctx, input, 80 ); - keccak256_4way_close( &ctx, state ); + keccak256_4x64_context ctx; + keccak256_4x64_init( &ctx ); + keccak256_4x64_update( &ctx, input, 80 ); + keccak256_4x64_close( &ctx, state ); } int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index 5830c17..49cb6bb 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -207,30 +207,30 @@ void keccak256_8way_init( void *kc ) } void -keccak256_8way_update(void *cc, const void *data, size_t len) +keccak256_8x64_update(void *cc, const void *data, size_t len) { keccak64_8way_core(cc, data, len, 136); } void -keccak256_8way_close(void *cc, void *dst) +keccak256_8x64_close(void *cc, void *dst) { keccak64_8way_close(cc, dst, 32, 136); } -void keccak512_8way_init( void *kc ) +void keccak512_8x64_init( void *kc ) { keccak64_8way_init( kc, 512 ); } void -keccak512_8way_update(void *cc, const void *data, size_t len) +keccak512_8x64_update(void *cc, const void *data, size_t len) { keccak64_8way_core(cc, data, len, 72); } void -keccak512_8way_close(void *cc, void *dst) +keccak512_8x64_close(void *cc, void *dst) { keccak64_8way_close(cc, dst, 64, 72); } @@ -395,24 +395,24 @@ void keccak256_4way_init( void *kc ) } void -keccak256_4way_update(void *cc, const void *data, size_t len) +keccak256_4x64_update(void *cc, const void *data, size_t len) { keccak64_core(cc, data, len, 136); } void -keccak256_4way_close(void *cc, void *dst) +keccak256_4x64_close(void *cc, void *dst) { keccak64_close(cc, dst, 32, 136); } -void keccak512_4way_init( void *kc ) +void keccak512_4x64_init( void *kc ) { keccak64_init( kc, 512 ); } void -keccak512_4way_update(void *cc, const void *data, size_t len) +keccak512_4x64_update(void *cc, const void *data, size_t len) { keccak64_core(cc, data, len, 72); } diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index 2055409..2606891 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -1,64 +1,94 @@ #ifndef KECCAK_HASH_4WAY_H__ #define KECCAK_HASH_4WAY_H__ -#ifdef __AVX2__ - #include #include "simd-utils.h" -/** - * This structure is a context for Keccak computations: it contains the - 
* intermediate values and some data from the last entered block. Once a - * Keccak computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running Keccak computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -typedef struct { - __m512i buf[144*8]; - __m512i w[25]; - size_t ptr, lim; +typedef struct +{ + __m512i buf[144*8]; + __m512i w[25]; + size_t ptr, lim; } keccak64_ctx_m512i __attribute__((aligned(128))); -typedef keccak64_ctx_m512i keccak256_8way_context; -typedef keccak64_ctx_m512i keccak512_8way_context; +typedef keccak64_ctx_m512i keccak256_8x64_context; +typedef keccak64_ctx_m512i keccak512_8x64_context; -void keccak256_8way_init(void *cc); -void keccak256_8way_update(void *cc, const void *data, size_t len); -void keccak256_8way_close(void *cc, void *dst); +void keccak256_8x64_init(void *cc); +void keccak256_8x64_update(void *cc, const void *data, size_t len); +void keccak256_8x64_close(void *cc, void *dst); -void keccak512_8way_init(void *cc); -void keccak512_8way_update(void *cc, const void *data, size_t len); -void keccak512_8way_close(void *cc, void *dst); -void keccak512_8way_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); +void keccak512_8x64_init(void *cc); +void keccak512_8x64_update(void *cc, const void *data, size_t len); +void keccak512_8x64_close(void *cc, void *dst); + +// legacy naming +#define keccak512_8way_context keccak512_8x64_context +#define keccak512_8way_init keccak512_8x64_init +#define keccak512_8way_update keccak512_8x64_update +#define keccak512_8way_close keccak512_8x64_close +#define keccak256_8way_context keccak256_8x64_context +#define keccak256_8way_init keccak256_8x64_init +#define keccak256_8way_update keccak256_8x64_update +#define keccak256_8way_close keccak256_8x64_close #endif -typedef struct { - __m256i buf[144*8]; - __m256i w[25]; - size_t ptr, lim; +#if defined(__AVX2__) + +typedef struct +{ + __m256i buf[144*8]; + __m256i w[25]; + size_t ptr, lim; } keccak64_ctx_m256i __attribute__((aligned(128))); -typedef keccak64_ctx_m256i keccak256_4way_context; -typedef keccak64_ctx_m256i keccak512_4way_context; +typedef keccak64_ctx_m256i keccak256_4x64_context; +typedef keccak64_ctx_m256i keccak512_4x64_context; -void keccak256_4way_init(void *cc); -void keccak256_4way_update(void *cc, const void *data, size_t len); -void keccak256_4way_close(void *cc, void *dst); +void keccak256_4x64_init(void *cc); +void keccak256_4x64_update(void *cc, const void *data, size_t len); +void keccak256_4x64_close(void *cc, void *dst); -void keccak512_4way_init(void *cc); -void keccak512_4way_update(void *cc, const void *data, size_t len); -void keccak512_4way_close(void *cc, void *dst); -void keccak512_4way_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); +void keccak512_4x64_init(void *cc); +void keccak512_4x64_update(void *cc, const void *data, size_t len); +void keccak512_4x64_close(void *cc, void *dst); + +// legacy naming +#define keccak512_4way_context keccak512_4x64_context +#define keccak512_4way_init keccak512_4x64_init +#define keccak512_4way_update keccak512_4x64_update +#define keccak512_4way_close keccak512_4x64_close +#define keccak256_4way_context keccak256_4x64_context +#define keccak256_4way_init keccak256_4x64_init +#define keccak256_4way_update keccak256_4x64_update +#define 
keccak256_4way_close keccak256_4x64_close + +#endif + +#if defined(__SSE2__) || defined(__ARM_NEON) + +typedef struct +{ + v128_t buf[144*4]; + v128_t w[50]; + size_t ptr, lim; +} keccak32_ctx_v128 __attribute__((aligned(64))); + +typedef keccak32_ctx_v128 keccak256_4x32_context; +typedef keccak32_ctx_v128 keccak512_4x32_context; + +void keccak256_4x32_init(void *cc); +void keccak256_4x32_update(void *cc, const void *data, size_t len); +void keccak256_4x32_close(void *cc, void *dst); + +void keccak512_4x32_init(void *cc); +void keccak512_4x32_update(void *cc, const void *data, size_t len); +void keccak512_4x32_close(void *cc, void *dst); #endif #endif + diff --git a/algo/keccak/sha3d-4way.c b/algo/keccak/sha3d-4way.c index ca5ab72..d11df12 100644 --- a/algo/keccak/sha3d-4way.c +++ b/algo/keccak/sha3d-4way.c @@ -11,13 +11,13 @@ void sha3d_hash_8way(void *state, const void *input) uint32_t buffer[16*8] __attribute__ ((aligned (128))); keccak256_8way_context ctx; - keccak256_8way_init( &ctx ); - keccak256_8way_update( &ctx, input, 80 ); - keccak256_8way_close( &ctx, buffer ); + keccak256_8x64_init( &ctx ); + keccak256_8x64_update( &ctx, input, 80 ); + keccak256_8x64_close( &ctx, buffer ); - keccak256_8way_init( &ctx ); - keccak256_8way_update( &ctx, buffer, 32 ); - keccak256_8way_close( &ctx, state ); + keccak256_8x64_init( &ctx ); + keccak256_8x64_update( &ctx, buffer, 32 ); + keccak256_8x64_close( &ctx, state ); } int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, @@ -71,13 +71,13 @@ void sha3d_hash_4way(void *state, const void *input) uint32_t buffer[16*4] __attribute__ ((aligned (64))); keccak256_4way_context ctx; - keccak256_4way_init( &ctx ); - keccak256_4way_update( &ctx, input, 80 ); - keccak256_4way_close( &ctx, buffer ); + keccak256_4x64_init( &ctx ); + keccak256_4x64_update( &ctx, input, 80 ); + keccak256_4x64_close( &ctx, buffer ); - keccak256_4way_init( &ctx ); - keccak256_4way_update( &ctx, buffer, 32 ); - keccak256_4way_close( &ctx, state ); + keccak256_4x64_init( &ctx ); + keccak256_4x64_update( &ctx, buffer, 32 ); + keccak256_4x64_close( &ctx, state ); } int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index f9b049b..45e27fa 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -1,5 +1,4 @@ #include -#include #include "luffa-hash-2way.h" #include diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index 043e488..820ed1b 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -22,18 +22,18 @@ #include "simd-utils.h" #include "luffa_for_sse2.h" -#define cns(i) ( ( (__m128i*)CNS_INIT)[i] ) +#define cns(i) ( ( (v128_t*)CNS_INIT)[i] ) #define ADD_CONSTANT( a, b, c0 ,c1 ) \ - a = _mm_xor_si128( a, c0 ); \ - b = _mm_xor_si128( b, c1 ); \ + a = v128_xor( a, c0 ); \ + b = v128_xor( b, c1 ); \ #if defined(__AVX512VL__) //TODO enable for AVX10_512 AVX10_256 #define MULT2( a0, a1 ) \ { \ - __m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \ + v128_t b = v128_xor( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \ a0 = _mm_alignr_epi8( a1, b, 4 ); \ a1 = _mm_alignr_epi8( b, a1, 4 ); \ } @@ -42,20 +42,35 @@ #define MULT2( a0, a1 ) do \ { \ - __m128i b = _mm_xor_si128( a0, \ + v128_t b = v128_xor( a0, \ _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \ a0 = _mm_alignr_epi8( a1, b, 4 ); \ a1 = _mm_alignr_epi8( b, a1, 4 ); \ } while(0) -#else +#elif defined(__ARM_NEON) + +#pragma message "NEON for 
Luffa" + +const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff }; + +// { a1_0, 0, a1_0, a1_0 } +#define MULT2( a0, a1 ) \ +{ \ + v128_t b = v128_xor( a0, \ + v128_and( v128_32( vgetq_lane_u32( a1, 0 ) ), mask ) ); \ + a0 = v128_alignr32( a1, b, 1 ); \ + a1 = v128_alignr32( b, a1, 1 ); \ +} + +#else // assume SSE2 #define MULT2( a0, a1 ) do \ { \ - __m128i b = _mm_xor_si128( a0, \ - _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \ - a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \ - a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ + v128_t b = v128_xor( a0, \ + _mm_shuffle_epi32( v128_and( a1, MASK ), 0x10 ) ); \ + a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \ + a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ } while(0) #endif @@ -65,16 +80,16 @@ #define SUBCRUMB( a0, a1, a2, a3 ) \ { \ - __m128i t = a0; \ + v128_t t = a0; \ a0 = mm128_xoror( a3, a0, a1 ); \ - a2 = _mm_xor_si128( a2, a3 ); \ + a2 = v128_xor( a2, a3 ); \ a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ a3 = mm128_xorand( a2, a3, t ); \ a2 = mm128_xorand( a1, a2, a0 ); \ - a1 = _mm_or_si128( a1, a3 ); \ - a3 = _mm_xor_si128( a3, a2 ); \ - t = _mm_xor_si128( t, a1 ); \ - a2 = _mm_and_si128( a2, a1 ); \ + a1 = v128_or( a1, a3 ); \ + a3 = v128_xor( a3, a2 ); \ + t = v128_xor( t, a1 ); \ + a2 = v128_and( a2, a1 ); \ a1 = mm128_xnor( a1, a0 ); \ a0 = t; \ } @@ -83,33 +98,33 @@ #define SUBCRUMB( a0, a1, a2, a3 ) \ { \ - __m128i t = a0; \ - a0 = _mm_or_si128( a0, a1 ); \ - a2 = _mm_xor_si128( a2, a3 ); \ - a1 = mm128_not( a1 ); \ - a0 = _mm_xor_si128( a0, a3 ); \ - a3 = _mm_and_si128( a3, t ); \ - a1 = _mm_xor_si128( a1, a3 ); \ - a3 = _mm_xor_si128( a3, a2 ); \ - a2 = _mm_and_si128( a2, a0 ); \ - a0 = mm128_not( a0 ); \ - a2 = _mm_xor_si128( a2, a1 ); \ - a1 = _mm_or_si128( a1, a3 ); \ - t = _mm_xor_si128( t , a1 ); \ - a3 = _mm_xor_si128( a3, a2 ); \ - a2 = _mm_and_si128( a2, a1 ); \ - a1 = _mm_xor_si128( a1, a0 ); \ + v128_t t = a0; \ + a0 = v128_or( a0, a1 ); \ + a2 = v128_xor( a2, a3 ); \ + a1 = v128_not( a1 ); \ + a0 = v128_xor( a0, a3 ); \ + a3 = v128_and( a3, t ); \ + a1 = v128_xor( a1, a3 ); \ + a3 = v128_xor( a3, a2 ); \ + a2 = v128_and( a2, a0 ); \ + a0 = v128_not( a0 ); \ + a2 = v128_xor( a2, a1 ); \ + a1 = v128_or( a1, a3 ); \ + t = v128_xor( t , a1 ); \ + a3 = v128_xor( a3, a2 ); \ + a2 = v128_and( a2, a1 ); \ + a1 = v128_xor( a1, a0 ); \ a0 = t; \ } #endif #define MIXWORD( a, b ) \ - b = _mm_xor_si128( a, b ); \ - a = _mm_xor_si128( b, mm128_rol_32( a, 2 ) ); \ - b = _mm_xor_si128( a, mm128_rol_32( b, 14 ) ); \ - a = _mm_xor_si128( b, mm128_rol_32( a, 10 ) ); \ - b = mm128_rol_32( b, 1 ); + b = v128_xor( a, b ); \ + a = v128_xor( b, v128_rol32( a, 2 ) ); \ + b = v128_xor( a, v128_rol32( b, 14 ) ); \ + a = v128_xor( b, v128_rol32( a, 10 ) ); \ + b = v128_rol32( b, 1 ); #define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \ SUBCRUMB( x0, x1, x2, x3 ); \ @@ -121,105 +136,47 @@ ADD_CONSTANT( x0, x4, c0, c1 ); #define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \ - t0 = _mm_shuffle_epi32( a1, 147 ); \ - a1 = _mm_unpacklo_epi32( t0, a0 ); \ - t0 = _mm_unpackhi_epi32( t0, a0 ); \ - t1 = _mm_shuffle_epi32( t0, 78 ); \ - a0 = _mm_shuffle_epi32( a1, 78 ); \ + t0 = v128_shufll32( a1 ); \ + a1 = v128_unpacklo32( t0, a0 ); \ + t0 = v128_unpackhi32( t0, a0 ); \ + t1 = v128_swap64( t0 ); \ + a0 = v128_swap64( a1 ); \ SUBCRUMB( t1, t0, a0, a1 ); \ - t0 = _mm_unpacklo_epi32( t0, t1 ); \ - a1 = 
_mm_unpacklo_epi32( a1, a0 ); \ - a0 = _mm_unpackhi_epi64( a1, t0 ); \ - a1 = _mm_unpacklo_epi64( a1, t0 ); \ - a1 = _mm_shuffle_epi32( a1, 57 ); \ + t0 = v128_unpacklo32( t0, t1 ); \ + a1 = v128_unpacklo32( a1, a0 ); \ + a0 = v128_unpackhi64( a1, t0 ); \ + a1 = v128_unpacklo64( a1, t0 ); \ + a1 = v128_shuflr32( a1 ); \ MIXWORD( a0, a1 ); \ ADD_CONSTANT( a0, a1, c0, c1 ); -#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\ - s2 = _mm_load_si128(&r1);\ - q2 = _mm_load_si128(&p1);\ - r2 = _mm_shuffle_epi32(r2,216);\ - p2 = _mm_shuffle_epi32(p2,216);\ - r1 = _mm_unpacklo_epi32(r1,r0);\ - p1 = _mm_unpacklo_epi32(p1,p0);\ - s2 = _mm_unpackhi_epi32(s2,r0);\ - q2 = _mm_unpackhi_epi32(q2,p0);\ - s0 = _mm_load_si128(&r2);\ - q0 = _mm_load_si128(&p2);\ - r2 = _mm_unpacklo_epi64(r2,r1);\ - p2 = _mm_unpacklo_epi64(p2,p1);\ - s1 = _mm_load_si128(&s0);\ - q1 = _mm_load_si128(&q0);\ - s0 = _mm_unpackhi_epi64(s0,r1);\ - q0 = _mm_unpackhi_epi64(q0,p1);\ - r2 = _mm_shuffle_epi32(r2,225);\ - p2 = _mm_shuffle_epi32(p2,225);\ - r0 = _mm_load_si128(&s1);\ - p0 = _mm_load_si128(&q1);\ - s0 = _mm_shuffle_epi32(s0,225);\ - q0 = _mm_shuffle_epi32(q0,225);\ - s1 = _mm_unpacklo_epi64(s1,s2);\ - q1 = _mm_unpacklo_epi64(q1,q2);\ - r0 = _mm_unpackhi_epi64(r0,s2);\ - p0 = _mm_unpackhi_epi64(p0,q2);\ - s2 = _mm_load_si128(&r0);\ - q2 = _mm_load_si128(&p0);\ - s3 = _mm_load_si128(&r2);\ - q3 = _mm_load_si128(&p2);\ - -#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\ - s0 = _mm_load_si128(&r0);\ - q0 = _mm_load_si128(&p0);\ - s1 = _mm_load_si128(&r2);\ - q1 = _mm_load_si128(&p2);\ - r0 = _mm_unpackhi_epi32(r0,r1);\ - p0 = _mm_unpackhi_epi32(p0,p1);\ - r2 = _mm_unpackhi_epi32(r2,r3);\ - p2 = _mm_unpackhi_epi32(p2,p3);\ - s0 = _mm_unpacklo_epi32(s0,r1);\ - q0 = _mm_unpacklo_epi32(q0,p1);\ - s1 = _mm_unpacklo_epi32(s1,r3);\ - q1 = _mm_unpacklo_epi32(q1,p3);\ - r1 = _mm_load_si128(&r0);\ - p1 = _mm_load_si128(&p0);\ - r0 = _mm_unpackhi_epi64(r0,r2);\ - p0 = _mm_unpackhi_epi64(p0,p2);\ - s0 = _mm_unpackhi_epi64(s0,s1);\ - q0 = _mm_unpackhi_epi64(q0,q1);\ - r1 = _mm_unpacklo_epi64(r1,r2);\ - p1 = _mm_unpacklo_epi64(p1,p2);\ - s2 = _mm_load_si128(&r0);\ - q2 = _mm_load_si128(&p0);\ - s1 = _mm_load_si128(&r1);\ - q1 = _mm_load_si128(&p1);\ - #define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ - s1 = _mm_unpackhi_epi32( r3, r2 ); \ - q1 = _mm_unpackhi_epi32( p3, p2 ); \ - s3 = _mm_unpacklo_epi32( r3, r2 ); \ - q3 = _mm_unpacklo_epi32( p3, p2 ); \ - r3 = _mm_unpackhi_epi32( r1, r0 ); \ - r1 = _mm_unpacklo_epi32( r1, r0 ); \ - p3 = _mm_unpackhi_epi32( p1, p0 ); \ - p1 = _mm_unpacklo_epi32( p1, p0 ); \ - s0 = _mm_unpackhi_epi64( s1, r3 ); \ - q0 = _mm_unpackhi_epi64( q1 ,p3 ); \ - s1 = _mm_unpacklo_epi64( s1, r3 ); \ - q1 = _mm_unpacklo_epi64( q1, p3 ); \ - s2 = _mm_unpackhi_epi64( s3, r1 ); \ - q2 = _mm_unpackhi_epi64( q3, p1 ); \ - s3 = _mm_unpacklo_epi64( s3, r1 ); \ - q3 = _mm_unpacklo_epi64( q3, p1 ); + s1 = v128_unpackhi32( r3, r2 ); \ + q1 = v128_unpackhi32( p3, p2 ); \ + s3 = v128_unpacklo32( r3, r2 ); \ + q3 = v128_unpacklo32( p3, p2 ); \ + r3 = v128_unpackhi32( r1, r0 ); \ + r1 = v128_unpacklo32( r1, r0 ); \ + p3 = v128_unpackhi32( p1, p0 ); \ + p1 = v128_unpacklo32( p1, p0 ); \ + s0 = v128_unpackhi64( s1, r3 ); \ + q0 = v128_unpackhi64( q1 ,p3 ); \ + s1 = v128_unpacklo64( s1, r3 ); \ + q1 = v128_unpacklo64( q1, p3 ); \ + s2 = v128_unpackhi64( s3, r1 ); \ + q2 = v128_unpackhi64( q3, p1 ); \ + s3 = v128_unpacklo64( s3, r1 ); \ + q3 = v128_unpacklo64( q3, p1 ); #define 
MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3); -static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ); +static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 ); -static void finalization512( hashState_luffa *state, uint32 *b ); +static void finalization512( hashState_luffa *state, uint32_t *b ); /* initial values of chaining variables */ -static const uint32 IV[40] __attribute((aligned(16))) = { +static const uint32_t IV[40] __attribute((aligned(16))) = { 0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69, 0xdef610bb,0xee058139,0x90152df4,0x6e292011, 0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95, @@ -233,7 +190,7 @@ static const uint32 IV[40] __attribute((aligned(16))) = { }; /* Round Constants */ -static const uint32 CNS_INIT[128] __attribute((aligned(16))) = { +static const uint32_t CNS_INIT[128] __attribute((aligned(16))) = { 0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6, 0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818, 0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299, @@ -269,29 +226,29 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = { }; -__m128i CNS128[32]; +v128_t CNS128[32]; #if !defined(__SSE4_1__) -__m128i MASK; +v128_t MASK; #endif -HashReturn init_luffa(hashState_luffa *state, int hashbitlen) +int init_luffa(hashState_luffa *state, int hashbitlen) { int i; state->hashbitlen = hashbitlen; #if !defined(__SSE4_1__) /* set the lower 32 bits to '1' */ - MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); + MASK = v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); #endif /* set the 32-bit round constant values to the 128-bit data field */ for ( i=0; i<32; i++ ) - CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] ); + CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] ); for ( i=0; i<10; i++ ) - state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] ); + state->chainv[i] = v128_load( (v128_t*)&IV[i*4] ); memset(state->buffer, 0, sizeof state->buffer ); - return SUCCESS; + return 0; } -HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, +int update_luffa( hashState_luffa *state, const void *data, size_t len ) { int i; @@ -301,8 +258,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, // full blocks for ( i = 0; i < blocks; i++ ) { - rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ), - mm128_bswap_32( casti_m128i( data, 0 ) ) ); + rnd512( state, v128_bswap32( casti_v128( data, 1 ) ), + v128_bswap32( casti_v128( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } @@ -311,37 +268,37 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, if ( state->rembytes ) { // remaining data bytes - casti_m128i( state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) ); + casti_v128( state->buffer, 0 ) = v128_bswap32( cast_v128( data ) ); // padding of partial block - casti_m128i( state->buffer, 1 ) = _mm_set_epi32( 0, 0, 0, 0x80000000 ); + casti_v128( state->buffer, 1 ) = v128_set32( 0, 0, 0, 0x80000000 ); } - return SUCCESS; + return 0; } -HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval) +int final_luffa(hashState_luffa *state, void *hashval) { // transform pad block if ( state->rembytes ) { // not empty, data is in buffer - rnd512( state, casti_m128i( state->buffer, 1 ), - casti_m128i( state->buffer, 0 ) ); + rnd512( state, casti_v128( state->buffer, 1 ), + casti_v128( state->buffer, 0 ) ); } else { // empty pad block, constant data - rnd512( state, _mm_setzero_si128(), 
_mm_set_epi32( 0, 0, 0, 0x80000000 ) ); + rnd512( state, v128_zero, v128_set32( 0, 0, 0, 0x80000000 ) ); } - finalization512(state, (uint32*) hashval); + finalization512(state, (uint32_t*) hashval); if ( state->hashbitlen > 512 ) - finalization512( state, (uint32*)( hashval+128 ) ); - return SUCCESS; + finalization512( state, (uint32_t*)( hashval+128 ) ); + return 0; } -HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, - const BitSequence* data, size_t inlen ) +int update_and_final_luffa( hashState_luffa *state, void* output, + const void* data, size_t inlen ) { // Optimized for integrals of 16 bytes, good for 64 and 80 byte len int i; @@ -351,43 +308,43 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, // full blocks for ( i = 0; i < blocks; i++ ) { - rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ), - mm128_bswap_32( casti_m128i( data, 0 ) ) ); + rnd512( state, v128_bswap32( casti_v128( data, 1 ) ), + v128_bswap32( casti_v128( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } // 16 byte partial block exists for 80 byte len if ( state->rembytes ) // padding of partial block - rnd512( state, mm128_mov64_128( 0x80000000 ), - mm128_bswap_32( cast_m128i( data ) ) ); + rnd512( state, v128_mov64( 0x80000000 ), + v128_bswap32( cast_v128( data ) ) ); else // empty pad block - rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) ); + rnd512( state, v128_zero, v128_64( 0x80000000 ) ); - finalization512( state, (uint32*) output ); + finalization512( state, (uint32_t*) output ); if ( state->hashbitlen > 512 ) - finalization512( state, (uint32*)( output+128 ) ); + finalization512( state, (uint32_t*)( output+128 ) ); - return SUCCESS; + return 0; } -int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, - const BitSequence* data, size_t inlen ) +int luffa_full( hashState_luffa *state, void* output, int hashbitlen, + const void* data, size_t inlen ) { // Optimized for integrals of 16 bytes, good for 64 and 80 byte len int i; state->hashbitlen = hashbitlen; #if !defined(__SSE4_1__) /* set the lower 32 bits to '1' */ - MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); + MASK= v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); #endif /* set the 32-bit round constant values to the 128-bit data field */ for ( i=0; i<32; i++ ) - CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] ); + CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] ); for ( i=0; i<10; i++ ) - state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] ); + state->chainv[i] = v128_load( (v128_t*)&IV[i*4] ); memset(state->buffer, 0, sizeof state->buffer ); // update @@ -398,8 +355,8 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, // full blocks for ( i = 0; i < blocks; i++ ) { - rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ), - mm128_bswap_32( casti_m128i( data, 0 ) ) ); + rnd512( state, v128_bswap32( casti_v128( data, 1 ) ), + v128_bswap32( casti_v128( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } @@ -408,17 +365,17 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, // 16 byte partial block exists for 80 byte len if ( state->rembytes ) // padding of partial block - rnd512( state, mm128_mov64_128( 0x80000000 ), - mm128_bswap_32( cast_m128i( data ) ) ); + rnd512( state, v128_mov64( 0x80000000 ), + v128_bswap32( cast_v128( data ) ) ); else // empty pad block - rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) ); + rnd512( state, v128_zero, v128_mov64( 0x80000000 
) ); - finalization512( state, (uint32*) output ); + finalization512( state, (uint32_t*) output ); if ( state->hashbitlen > 512 ) - finalization512( state, (uint32*)( output+128 ) ); + finalization512( state, (uint32_t*)( output+128 ) ); - return SUCCESS; + return 0; } @@ -426,97 +383,97 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, /* Round function */ /* state: hash context */ -static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ) +static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 ) { - __m128i t0, t1; - __m128i *chainv = state->chainv; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; + v128_t t0, t1; + v128_t *chainv = state->chainv; + v128_t x0, x1, x2, x3, x4, x5, x6, x7; - t0 = mm128_xor3( chainv[0], chainv[2], chainv[4] ); - t1 = mm128_xor3( chainv[1], chainv[3], chainv[5] ); - t0 = mm128_xor3( t0, chainv[6], chainv[8] ); - t1 = mm128_xor3( t1, chainv[7], chainv[9] ); + t0 = v128_xor3( chainv[0], chainv[2], chainv[4] ); + t1 = v128_xor3( chainv[1], chainv[3], chainv[5] ); + t0 = v128_xor3( t0, chainv[6], chainv[8] ); + t1 = v128_xor3( t1, chainv[7], chainv[9] ); MULT2( t0, t1 ); - msg0 = _mm_shuffle_epi32( msg0, 27 ); - msg1 = _mm_shuffle_epi32( msg1, 27 ); + msg0 = v128_rev32( msg0 ); + msg1 = v128_rev32( msg1 ); - chainv[0] = _mm_xor_si128( chainv[0], t0 ); - chainv[1] = _mm_xor_si128( chainv[1], t1 ); - chainv[2] = _mm_xor_si128( chainv[2], t0 ); - chainv[3] = _mm_xor_si128( chainv[3], t1 ); - chainv[4] = _mm_xor_si128( chainv[4], t0 ); - chainv[5] = _mm_xor_si128( chainv[5], t1 ); - chainv[6] = _mm_xor_si128( chainv[6], t0 ); - chainv[7] = _mm_xor_si128( chainv[7], t1 ); - chainv[8] = _mm_xor_si128( chainv[8], t0 ); - chainv[9] = _mm_xor_si128( chainv[9], t1 ); + chainv[0] = v128_xor( chainv[0], t0 ); + chainv[1] = v128_xor( chainv[1], t1 ); + chainv[2] = v128_xor( chainv[2], t0 ); + chainv[3] = v128_xor( chainv[3], t1 ); + chainv[4] = v128_xor( chainv[4], t0 ); + chainv[5] = v128_xor( chainv[5], t1 ); + chainv[6] = v128_xor( chainv[6], t0 ); + chainv[7] = v128_xor( chainv[7], t1 ); + chainv[8] = v128_xor( chainv[8], t0 ); + chainv[9] = v128_xor( chainv[9], t1 ); t0 = chainv[0]; t1 = chainv[1]; MULT2( chainv[0], chainv[1]); - chainv[0] = _mm_xor_si128( chainv[0], chainv[2] ); - chainv[1] = _mm_xor_si128( chainv[1], chainv[3] ); + chainv[0] = v128_xor( chainv[0], chainv[2] ); + chainv[1] = v128_xor( chainv[1], chainv[3] ); MULT2( chainv[2], chainv[3]); - chainv[2] = _mm_xor_si128(chainv[2], chainv[4]); - chainv[3] = _mm_xor_si128(chainv[3], chainv[5]); + chainv[2] = v128_xor(chainv[2], chainv[4]); + chainv[3] = v128_xor(chainv[3], chainv[5]); MULT2( chainv[4], chainv[5]); - chainv[4] = _mm_xor_si128(chainv[4], chainv[6]); - chainv[5] = _mm_xor_si128(chainv[5], chainv[7]); + chainv[4] = v128_xor(chainv[4], chainv[6]); + chainv[5] = v128_xor(chainv[5], chainv[7]); MULT2( chainv[6], chainv[7]); - chainv[6] = _mm_xor_si128(chainv[6], chainv[8]); - chainv[7] = _mm_xor_si128(chainv[7], chainv[9]); + chainv[6] = v128_xor(chainv[6], chainv[8]); + chainv[7] = v128_xor(chainv[7], chainv[9]); MULT2( chainv[8], chainv[9]); - t0 = chainv[8] = _mm_xor_si128( chainv[8], t0 ); - t1 = chainv[9] = _mm_xor_si128( chainv[9], t1 ); + t0 = chainv[8] = v128_xor( chainv[8], t0 ); + t1 = chainv[9] = v128_xor( chainv[9], t1 ); MULT2( chainv[8], chainv[9]); - chainv[8] = _mm_xor_si128( chainv[8], chainv[6] ); - chainv[9] = _mm_xor_si128( chainv[9], chainv[7] ); + chainv[8] = v128_xor( chainv[8], chainv[6] ); + chainv[9] = v128_xor( chainv[9], 
chainv[7] ); MULT2( chainv[6], chainv[7]); - chainv[6] = _mm_xor_si128( chainv[6], chainv[4] ); - chainv[7] = _mm_xor_si128( chainv[7], chainv[5] ); + chainv[6] = v128_xor( chainv[6], chainv[4] ); + chainv[7] = v128_xor( chainv[7], chainv[5] ); MULT2( chainv[4], chainv[5]); - chainv[4] = _mm_xor_si128( chainv[4], chainv[2] ); - chainv[5] = _mm_xor_si128( chainv[5], chainv[3] ); + chainv[4] = v128_xor( chainv[4], chainv[2] ); + chainv[5] = v128_xor( chainv[5], chainv[3] ); MULT2( chainv[2], chainv[3] ); - chainv[2] = _mm_xor_si128( chainv[2], chainv[0] ); - chainv[3] = _mm_xor_si128( chainv[3], chainv[1] ); + chainv[2] = v128_xor( chainv[2], chainv[0] ); + chainv[3] = v128_xor( chainv[3], chainv[1] ); MULT2( chainv[0], chainv[1] ); - chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t0 ), msg0 ); - chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t1 ), msg1 ); + chainv[0] = v128_xor( v128_xor( chainv[0], t0 ), msg0 ); + chainv[1] = v128_xor( v128_xor( chainv[1], t1 ), msg1 ); MULT2( msg0, msg1); - chainv[2] = _mm_xor_si128( chainv[2], msg0 ); - chainv[3] = _mm_xor_si128( chainv[3], msg1 ); + chainv[2] = v128_xor( chainv[2], msg0 ); + chainv[3] = v128_xor( chainv[3], msg1 ); MULT2( msg0, msg1); - chainv[4] = _mm_xor_si128( chainv[4], msg0 ); - chainv[5] = _mm_xor_si128( chainv[5], msg1 ); + chainv[4] = v128_xor( chainv[4], msg0 ); + chainv[5] = v128_xor( chainv[5], msg1 ); MULT2( msg0, msg1); - chainv[6] = _mm_xor_si128( chainv[6], msg0 ); - chainv[7] = _mm_xor_si128( chainv[7], msg1 ); + chainv[6] = v128_xor( chainv[6], msg0 ); + chainv[7] = v128_xor( chainv[7], msg1 ); MULT2( msg0, msg1); - chainv[8] = _mm_xor_si128( chainv[8], msg0 ); - chainv[9] = _mm_xor_si128( chainv[9], msg1 ); + chainv[8] = v128_xor( chainv[8], msg0 ); + chainv[9] = v128_xor( chainv[9], msg1 ); MULT2( msg0, msg1); - chainv[3] = mm128_rol_32( chainv[3], 1 ); - chainv[5] = mm128_rol_32( chainv[5], 2 ); - chainv[7] = mm128_rol_32( chainv[7], 3 ); - chainv[9] = mm128_rol_32( chainv[9], 4 ); + chainv[3] = v128_rol32( chainv[3], 1 ); + chainv[5] = v128_rol32( chainv[5], 2 ); + chainv[7] = v128_rol32( chainv[7], 3 ); + chainv[9] = v128_rol32( chainv[9], 4 ); NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3, chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 ); @@ -549,57 +506,57 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ) /* state: hash context */ /* b[8]: hash values */ -static void finalization512( hashState_luffa *state, uint32 *b ) +static void finalization512( hashState_luffa *state, uint32_t *b ) { - uint32 hash[8] __attribute((aligned(64))); - __m128i* chainv = state->chainv; - __m128i t[2]; - const __m128i zero = _mm_setzero_si128(); + uint32_t hash[8] __attribute((aligned(64))); + v128_t* chainv = state->chainv; + v128_t t[2]; + const v128_t zero = v128_zero; /*---- blank round with m=0 ----*/ rnd512( state, zero, zero ); t[0] = chainv[0]; t[1] = chainv[1]; - t[0] = _mm_xor_si128(t[0], chainv[2]); - t[1] = _mm_xor_si128(t[1], chainv[3]); - t[0] = _mm_xor_si128(t[0], chainv[4]); - t[1] = _mm_xor_si128(t[1], chainv[5]); - t[0] = _mm_xor_si128(t[0], chainv[6]); - t[1] = _mm_xor_si128(t[1], chainv[7]); - t[0] = _mm_xor_si128(t[0], chainv[8]); - t[1] = _mm_xor_si128(t[1], chainv[9]); + t[0] = v128_xor(t[0], chainv[2]); + t[1] = v128_xor(t[1], chainv[3]); + t[0] = v128_xor(t[0], chainv[4]); + t[1] = v128_xor(t[1], chainv[5]); + t[0] = v128_xor(t[0], chainv[6]); + t[1] = v128_xor(t[1], chainv[7]); + t[0] = v128_xor(t[0], chainv[8]); + t[1] = 
v128_xor(t[1], chainv[9]); - t[0] = _mm_shuffle_epi32(t[0], 27); - t[1] = _mm_shuffle_epi32(t[1], 27); + t[0] = v128_rev32( t[0] ); + t[1] = v128_rev32( t[1] ); - _mm_store_si128((__m128i*)&hash[0], t[0]); - _mm_store_si128((__m128i*)&hash[4], t[1]); + v128_store((v128_t*)&hash[0], t[0]); + v128_store((v128_t*)&hash[4], t[1]); - casti_m128i( b, 0 ) = mm128_bswap_32( casti_m128i( hash, 0 ) ); - casti_m128i( b, 1 ) = mm128_bswap_32( casti_m128i( hash, 1 ) ); + casti_v128( b, 0 ) = v128_bswap32( casti_v128( hash, 0 ) ); + casti_v128( b, 1 ) = v128_bswap32( casti_v128( hash, 1 ) ); rnd512( state, zero, zero ); t[0] = chainv[0]; t[1] = chainv[1]; - t[0] = _mm_xor_si128(t[0], chainv[2]); - t[1] = _mm_xor_si128(t[1], chainv[3]); - t[0] = _mm_xor_si128(t[0], chainv[4]); - t[1] = _mm_xor_si128(t[1], chainv[5]); - t[0] = _mm_xor_si128(t[0], chainv[6]); - t[1] = _mm_xor_si128(t[1], chainv[7]); - t[0] = _mm_xor_si128(t[0], chainv[8]); - t[1] = _mm_xor_si128(t[1], chainv[9]); + t[0] = v128_xor(t[0], chainv[2]); + t[1] = v128_xor(t[1], chainv[3]); + t[0] = v128_xor(t[0], chainv[4]); + t[1] = v128_xor(t[1], chainv[5]); + t[0] = v128_xor(t[0], chainv[6]); + t[1] = v128_xor(t[1], chainv[7]); + t[0] = v128_xor(t[0], chainv[8]); + t[1] = v128_xor(t[1], chainv[9]); - t[0] = _mm_shuffle_epi32(t[0], 27); - t[1] = _mm_shuffle_epi32(t[1], 27); + t[0] = v128_rev32( t[0] ); + t[1] = v128_rev32( t[1] ); - _mm_store_si128((__m128i*)&hash[0], t[0]); - _mm_store_si128((__m128i*)&hash[4], t[1]); + casti_v128( hash, 0 ) = t[0]; + casti_v128( hash, 1 ) = t[1]; - casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) ); - casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) ); + casti_v128( b, 2 ) = v128_bswap32( casti_v128( hash, 0 ) ); + casti_v128( b, 3 ) = v128_bswap32( casti_v128( hash, 1 ) ); } /***************************************************/ diff --git a/algo/luffa/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h index aaa066e..bbad313 100644 --- a/algo/luffa/luffa_for_sse2.h +++ b/algo/luffa/luffa_for_sse2.h @@ -21,8 +21,8 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ -#include -#include "compat/sha3-defs.h" +//#include +//#include "compat/sha3-defs.h" /* The length of digests*/ #define DIGEST_BIT_LEN_224 224 #define DIGEST_BIT_LEN_256 256 @@ -49,23 +49,23 @@ /*********************************/ typedef struct { - uint32 buffer[8] __attribute((aligned(32))); - __m128i chainv[10] __attribute((aligned(32))); /* Chaining values */ + uint32_t buffer[8] __attribute((aligned(32))); + v128_t chainv[10] __attribute((aligned(32))); /* Chaining values */ int hashbitlen; int rembytes; } hashState_luffa; -HashReturn init_luffa( hashState_luffa *state, int hashbitlen ); +int init_luffa( hashState_luffa *state, int hashbitlen ); // len is in bytes -HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, +int update_luffa( hashState_luffa *state, const void *data, size_t len ); -HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval ); +int final_luffa( hashState_luffa *state, void *hashval ); -HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, - const BitSequence* data, size_t inlen ); +int update_and_final_luffa( hashState_luffa *state, void* output, + const void* data, size_t inlen ); -int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, - const BitSequence* data, size_t inlen ); +int luffa_full( hashState_luffa *state, void* output, int hashbitlen, + const void* data, size_t inlen ); #endif // LUFFA_FOR_SSE2_H___ diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index 9ec505b..68c5ec5 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -1,5 +1,5 @@ #include "lyra2-gate.h" - +#include // huge pages // diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c index 3b77139..49ac9ce 100644 --- a/algo/lyra2/lyra2h-4way.c +++ b/algo/lyra2/lyra2h-4way.c @@ -63,7 +63,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce, if ( opt_benchmark ) ptarget[7] = 0x0000ff; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); lyra2h_4way_midstate( vdata ); do { diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c index 0211622..5632fdd 100644 --- a/algo/lyra2/lyra2rev2-4way.c +++ b/algo/lyra2/lyra2rev2-4way.c @@ -353,9 +353,6 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce, return 0; } -#endif - -/* #elif defined (LYRA2REV2_4WAY) typedef struct { @@ -452,7 +449,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); blake256_4way_init( &l2v2_4way_ctx.blake ); blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 ); @@ -480,4 +477,4 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, } #endif -*/ + diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index 6f14832..6443697 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -371,7 +371,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce, if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); *noncev = _mm_set_epi32( n+3, n+2, n+1, n ); blake256_4way_init( &l2v3_4way_ctx.blake ); diff --git a/algo/lyra2/lyra2rev3.c b/algo/lyra2/lyra2rev3.c index d1e5b51..7fa2428 100644 --- a/algo/lyra2/lyra2rev3.c +++ b/algo/lyra2/lyra2rev3.c @@ -75,11 +75,11 @@ int 
scanhash_lyra2rev3( struct work *work, ((uint32_t*)ptarget)[7] = 0x0000ff; // need big endian data - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + casti_v128( endiandata, 0 ) = v128_bswap32( casti_v128( pdata, 0 ) ); + casti_v128( endiandata, 1 ) = v128_bswap32( casti_v128( pdata, 1 ) ); + casti_v128( endiandata, 2 ) = v128_bswap32( casti_v128( pdata, 2 ) ); + casti_v128( endiandata, 3 ) = v128_bswap32( casti_v128( pdata, 3 ) ); + casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) ); l2v3_blake256_midstate( endiandata ); do { diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 67b3e11..ccc212b 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -312,7 +312,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x0000ff; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); *noncev = _mm_set_epi32( n+3, n+2, n+1, n ); lyra2z_4way_midstate( vdata ); diff --git a/algo/lyra2/lyra2z.c b/algo/lyra2/lyra2z.c index 7ad7eee..638ca51 100644 --- a/algo/lyra2/lyra2z.c +++ b/algo/lyra2/lyra2z.c @@ -53,7 +53,6 @@ int scanhash_lyra2z( struct work *work, uint32_t max_nonce, uint32_t _ALIGN(64) endiandata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; int thr_id = mythr->id; diff --git a/algo/lyra2/lyra2z330.c b/algo/lyra2/lyra2z330.c index ef8b788..d235a16 100644 --- a/algo/lyra2/lyra2z330.c +++ b/algo/lyra2/lyra2z330.c @@ -2,6 +2,7 @@ #include "algo-gate-api.h" #include "lyra2.h" #include "simd-utils.h" +#include static __thread uint64_t* lyra2z330_wholeMatrix; @@ -29,11 +30,11 @@ int scanhash_lyra2z330( struct work *work, uint32_t max_nonce, if (opt_benchmark) ptarget[7] = 0x0000ff; - casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + casti_v128( edata, 0 ) = v128_bswap32( casti_v128( pdata, 0 ) ); + casti_v128( edata, 1 ) = v128_bswap32( casti_v128( pdata, 1 ) ); + casti_v128( edata, 2 ) = v128_bswap32( casti_v128( pdata, 2 ) ); + casti_v128( edata, 3 ) = v128_bswap32( casti_v128( pdata, 3 ) ); + casti_v128( edata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) ); do { diff --git a/algo/lyra2/sponge-2way.c b/algo/lyra2/sponge-2way.c index cb71249..572b021 100644 --- a/algo/lyra2/sponge-2way.c +++ b/algo/lyra2/sponge-2way.c @@ -23,9 +23,9 @@ #include #include #include -#include #include "sponge.h" #include "lyra2.h" +#include "simd-utils.h" #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) diff --git a/algo/lyra2/sponge.c b/algo/lyra2/sponge.c index 72abce3..16c1d69 100644 --- a/algo/lyra2/sponge.c +++ b/algo/lyra2/sponge.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include "simd-utils.h" #include "sponge.h" #include "lyra2.h" diff --git a/algo/lyra2/sponge.h 
b/algo/lyra2/sponge.h index 98728a7..bb11ce9 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -195,7 +195,7 @@ static const uint64_t blake2b_IV[8] = #endif // AVX2 else SSE2 -/* + // Scalar, not used. static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ @@ -223,7 +223,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ G(r,5,v[ 1],v[ 6],v[11],v[12]); \ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); -*/ + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) diff --git a/algo/panama/panama-hash-4way.c b/algo/panama/panama-hash-4way.c index ab8b79c..98b57e1 100644 --- a/algo/panama/panama-hash-4way.c +++ b/algo/panama/panama-hash-4way.c @@ -42,7 +42,7 @@ do { \ // // Panama-256 4 way SSE2 -#define LVAR17_4W(b) __m128i \ +#define LVAR17_4W(b) v128_t \ b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \ b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \ b ## 12, b ## 13, b ## 14, b ## 15, b ## 16; @@ -53,9 +53,9 @@ do { \ #define BUPDATE1_4W( n0, n2 ) \ do { \ - sc->buffer[ptr24][n0] = _mm_xor_si128( sc->buffer[ptr24][n0], \ + sc->buffer[ptr24][n0] = v128_xor( sc->buffer[ptr24][n0], \ sc->buffer[ptr31][n2] ); \ - sc->buffer[ptr31][n2] = _mm_xor_si128( sc->buffer[ptr31][n2], INW1(n2) ); \ + sc->buffer[ptr31][n2] = v128_xor( sc->buffer[ptr31][n2], INW1(n2) ); \ } while (0) #define BUPDATE_4W \ @@ -71,50 +71,50 @@ do { \ } while (0) #define GAMMA_4W(n0, n1, n2, n4) \ - (g ## n0 = _mm_xor_si128( a ## n0, \ - _mm_or_si128( a ## n1, mm128_not( a ## n2 ) ) ) ) + (g ## n0 = v128_xor( a ## n0, \ + v128_or( a ## n1, v128_not( a ## n2 ) ) ) ) #define PI_ALL_4W do { \ a0 = g0; \ - a1 = mm128_rol_32( g7, 1 ); \ - a2 = mm128_rol_32( g14, 3 ); \ - a3 = mm128_rol_32( g4, 6 ); \ - a4 = mm128_rol_32( g11, 10 ); \ - a5 = mm128_rol_32( g1, 15 ); \ - a6 = mm128_rol_32( g8, 21 ); \ - a7 = mm128_rol_32( g15, 28 ); \ - a8 = mm128_rol_32( g5, 4 ); \ - a9 = mm128_rol_32( g12, 13 ); \ - a10 = mm128_rol_32( g2, 23 ); \ - a11 = mm128_rol_32( g9, 2 ); \ - a12 = mm128_rol_32( g16, 14 ); \ - a13 = mm128_rol_32( g6, 27 ); \ - a14 = mm128_rol_32( g13, 9 ); \ - a15 = mm128_rol_32( g3, 24 ); \ - a16 = mm128_rol_32( g10, 8 ); \ + a1 = v128_rol32( g7, 1 ); \ + a2 = v128_rol32( g14, 3 ); \ + a3 = v128_rol32( g4, 6 ); \ + a4 = v128_rol32( g11, 10 ); \ + a5 = v128_rol32( g1, 15 ); \ + a6 = v128_rol32( g8, 21 ); \ + a7 = v128_rol32( g15, 28 ); \ + a8 = v128_rol32( g5, 4 ); \ + a9 = v128_rol32( g12, 13 ); \ + a10 = v128_rol32( g2, 23 ); \ + a11 = v128_rol32( g9, 2 ); \ + a12 = v128_rol32( g16, 14 ); \ + a13 = v128_rol32( g6, 27 ); \ + a14 = v128_rol32( g13, 9 ); \ + a15 = v128_rol32( g3, 24 ); \ + a16 = v128_rol32( g10, 8 ); \ } while (0) #define THETA_4W(n0, n1, n2, n4) \ - ( g ## n0 = _mm_xor_si128( a ## n0, _mm_xor_si128( a ## n1, a ## n4 ) ) ) + ( g ## n0 = v128_xor( a ## n0, v128_xor( a ## n1, a ## n4 ) ) ) #define SIGMA_ALL_4W do { \ - a0 = _mm_xor_si128( g0, v128_32( 1 ) ); \ - a1 = _mm_xor_si128( g1, INW2( 0 ) ); \ - a2 = _mm_xor_si128( g2, INW2( 1 ) ); \ - a3 = _mm_xor_si128( g3, INW2( 2 ) ); \ - a4 = _mm_xor_si128( g4, INW2( 3 ) ); \ - a5 = _mm_xor_si128( g5, INW2( 4 ) ); \ - a6 = _mm_xor_si128( g6, INW2( 5 ) ); \ - a7 = _mm_xor_si128( g7, INW2( 6 ) ); \ - a8 = _mm_xor_si128( g8, INW2( 7 ) ); \ - a9 = _mm_xor_si128( g9, sc->buffer[ ptr16 ][0] ); \ - a10 = _mm_xor_si128( g10, sc->buffer[ ptr16 ][1] ); \ - a11 = _mm_xor_si128( g11, sc->buffer[ ptr16 ][2] ); \ - a12 = _mm_xor_si128( g12, sc->buffer[ ptr16 ][3] ); \ 
- a13 = _mm_xor_si128( g13, sc->buffer[ ptr16 ][4] ); \ - a14 = _mm_xor_si128( g14, sc->buffer[ ptr16 ][5] ); \ - a15 = _mm_xor_si128( g15, sc->buffer[ ptr16 ][6] ); \ - a16 = _mm_xor_si128( g16, sc->buffer[ ptr16 ][7] ); \ + a0 = v128_xor( g0, v128_32( 1 ) ); \ + a1 = v128_xor( g1, INW2( 0 ) ); \ + a2 = v128_xor( g2, INW2( 1 ) ); \ + a3 = v128_xor( g3, INW2( 2 ) ); \ + a4 = v128_xor( g4, INW2( 3 ) ); \ + a5 = v128_xor( g5, INW2( 4 ) ); \ + a6 = v128_xor( g6, INW2( 5 ) ); \ + a7 = v128_xor( g7, INW2( 6 ) ); \ + a8 = v128_xor( g8, INW2( 7 ) ); \ + a9 = v128_xor( g9, sc->buffer[ ptr16 ][0] ); \ + a10 = v128_xor( g10, sc->buffer[ ptr16 ][1] ); \ + a11 = v128_xor( g11, sc->buffer[ ptr16 ][2] ); \ + a12 = v128_xor( g12, sc->buffer[ ptr16 ][3] ); \ + a13 = v128_xor( g13, sc->buffer[ ptr16 ][4] ); \ + a14 = v128_xor( g14, sc->buffer[ ptr16 ][5] ); \ + a15 = v128_xor( g15, sc->buffer[ ptr16 ][6] ); \ + a16 = v128_xor( g16, sc->buffer[ ptr16 ][7] ); \ } while (0) #define PANAMA_STEP_4W do { \ @@ -138,7 +138,7 @@ panama_4way_push( panama_4way_context *sc, const unsigned char *pbuf, LVARS_4W unsigned ptr0; -#define INW1(i) casti_m128i( pbuf, i ) +#define INW1(i) casti_v128( pbuf, i ) #define INW2(i) INW1(i) M17( RSTATE ); @@ -167,7 +167,7 @@ panama_4way_pull( panama_4way_context *sc, unsigned num ) #define INW1(i) INW_H1(INC ## i) #define INW_H1(i) INW_H2(i) #define INW_H2(i) a ## i -#define INW2(i) casti_m128i( sc->buffer[ptr4], i ) +#define INW2(i) casti_v128( sc->buffer[ptr4], i ) M17( RSTATE ); ptr0 = sc->buffer_ptr; @@ -254,7 +254,7 @@ panama_4way_update( void *cc, const void *data, size_t len ) rlen = len & 31; if ( rlen > 0 ) - memcpy_128( (__m128i*)sc->data, (__m128i*)data + len - rlen, rlen ); + v128_memcpy( (v128_t*)sc->data, (v128_t*)data + len - rlen, rlen ); sc->data_ptr = rlen; } @@ -268,13 +268,13 @@ panama_4way_close( void *cc, void *dst ) sc = cc; current = sc->data_ptr; - *(__m128i*)( sc->data + current ) = v128_32( 1 ); + *(v128_t*)( sc->data + current ) = v128_32( 1 ); current++; - memset_zero_128( (__m128i*)sc->data + current, 32 - current ); + v128_memset_zero( (v128_t*)sc->data + current, 32 - current ); panama_4way_push( sc, sc->data, 1 ); panama_4way_pull( sc, 32 ); for ( i = 0; i < 8; i ++ ) - casti_m128i( dst, i ) = sc->state[i + 9]; + casti_v128( dst, i ) = sc->state[i + 9]; } diff --git a/algo/panama/panama-hash-4way.h b/algo/panama/panama-hash-4way.h index 21eede8..4af7442 100644 --- a/algo/panama/panama-hash-4way.h +++ b/algo/panama/panama-hash-4way.h @@ -11,8 +11,8 @@ typedef struct { unsigned char data[32<<2]; - __m128i buffer[32][8]; - __m128i state[17]; + v128_t buffer[32][8]; + v128_t state[17]; unsigned data_ptr; unsigned buffer_ptr; } panama_4way_context __attribute__ ((aligned (64))); diff --git a/algo/qubit/deep.c b/algo/qubit/deep.c index 230a291..c80cba5 100644 --- a/algo/qubit/deep.c +++ b/algo/qubit/deep.c @@ -56,21 +56,20 @@ void deep_hash(void *output, const void *input) const int midlen = 64; // bytes const int tail = 80 - midlen; // 16 memcpy( &ctx.luffa, &deep_luffa_mid, sizeof deep_luffa_mid ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)input + midlen, tail ); + update_and_final_luffa( &ctx.luffa, hash, + input + midlen, tail ); - cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, - (const byte*) hash,64); + cubehashUpdateDigest( &ctx.cubehash, hash, + hash,64); #ifdef __AES__ - update_final_echo ( &ctx.echo, (BitSequence *) hash, - (const BitSequence *) hash, 512); + update_final_echo ( &ctx.echo, hash, + hash, 
512); #else sph_echo512 (&ctx.echo, (const void*) hash, 64); sph_echo512_close(&ctx.echo, (void*) hash); #endif - asm volatile ("emms"); memcpy(output, hash, 32); } diff --git a/algo/qubit/qubit.c b/algo/qubit/qubit.c index 38b72ee..976bb9f 100644 --- a/algo/qubit/qubit.c +++ b/algo/qubit/qubit.c @@ -82,7 +82,6 @@ void qubit_hash(void *output, const void *input) sph_echo512_close(&ctx.echo, (void*) hash); #endif - asm volatile ("emms"); memcpy(output, hash, 32); } diff --git a/algo/ripemd/lbry.c b/algo/ripemd/lbry.c index e91b287..bfd51fa 100644 --- a/algo/ripemd/lbry.c +++ b/algo/ripemd/lbry.c @@ -8,6 +8,7 @@ #include #include "sph_ripemd.h" #include "algo/sha/sha256-hash.h" +#include "algo/sha/sha512-hash.h" void lbry_hash(void* output, const void* input) { diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c index c08b44e..37c35f8 100644 --- a/algo/scrypt/scrypt-core-4way.c +++ b/algo/scrypt/scrypt-core-4way.c @@ -197,99 +197,99 @@ do{ \ do{ \ TYPE TA = ADD32( XA0, XA3 ); \ TYPE TB = ADD32( XB0, XB3 ); \ - TYPE T = _mm_slli_epi32( TA, 7 ); \ - TA = _mm_srli_epi32( TA, 25 ); \ + TYPE T = v128_sl32( TA, 7 ); \ + TA = v128_sr32( TA, 25 ); \ XA1 = XOR( XA1, T ); \ XA1 = XOR( XA1, TA ); \ - T = _mm_slli_epi32( TB, 7 );\ - TB = _mm_srli_epi32( TB, 25 ); \ + T = v128_sl32( TB, 7 );\ + TB = v128_sr32( TB, 25 ); \ XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ \ TA = ADD32( XA1, XA0 ); \ TB = ADD32( XB1, XB0 ); \ - T = _mm_slli_epi32( TA, 9 ); \ - TA = _mm_srli_epi32( TA, 23 ); \ + T = v128_sl32( TA, 9 ); \ + TA = v128_sr32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ - T = _mm_slli_epi32( TB, 9 );\ - TB = _mm_srli_epi32( TB, 23 );\ + T = v128_sl32( TB, 9 );\ + TB = v128_sr32( TB, 23 );\ XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ \ TA = ADD32( XA2, XA1 ); \ TB = ADD32( XB2, XB1 ); \ - T = _mm_slli_epi32( TA, 13); \ - TA = _mm_srli_epi32( TA, 19 ); \ + T = v128_sl32( TA, 13); \ + TA = v128_sr32( TA, 19 ); \ XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - T = _mm_slli_epi32( TB, 13); \ - TB = _mm_srli_epi32( TB, 19 ); \ + T = v128_sl32( TB, 13); \ + TB = v128_sr32( TB, 19 ); \ XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ \ TA = ADD32( XA3, XA2 ); \ TB = ADD32( XB3, XB2 ); \ - T = _mm_slli_epi32( TA, 18 ); \ - TA = _mm_srli_epi32( TA, 14 ); \ + T = v128_sl32( TA, 18 ); \ + TA = v128_sr32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ XA0 = XOR( XA0, T ); \ XA0 = XOR( XA0, TA ); \ - T = _mm_slli_epi32( TB, 18 ); \ - TB = _mm_srli_epi32( TB, 14 ); \ + T = v128_sl32( TB, 18 ); \ + TB = v128_sr32( TB, 14 ); \ XB0 = XOR( XB0, T ); \ XB0 = XOR( XB0, TB ); \ \ TA = ADD32( XA0, XA1 ); \ TB = ADD32( XB0, XB1 ); \ - T = _mm_slli_epi32( TA, 7 ); \ - TA = _mm_srli_epi32( TA, 25 ); \ + T = v128_sl32( TA, 7 ); \ + TA = v128_sr32( TA, 25 ); \ XA3 = ROR_1X32( XA3 ); \ XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - T = _mm_slli_epi32( TB, 7 ); \ - TB = _mm_srli_epi32( TB, 25 ); \ + T = v128_sl32( TB, 7 ); \ + TB = v128_sr32( TB, 25 ); \ XB3 = ROR_1X32( XB3 ); \ XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ \ TA = ADD32( XA3, XA0 ); \ TB = ADD32( XB3, XB0 ); \ - T = _mm_slli_epi32( TA, 9 ); \ - TA = _mm_srli_epi32( TA, 23 ); \ + T = v128_sl32( TA, 9 ); \ + TA = v128_sr32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ - T = _mm_slli_epi32( TB, 9 ); \ - TB = _mm_srli_epi32( TB, 23 ); \ + T = v128_sl32( TB, 9 ); \ + TB = v128_sr32( TB, 23 ); \ XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ \ TA 
= ADD32( XA2, XA3 ); \ TB = ADD32( XB2, XB3 ); \ - T = _mm_slli_epi32( TA, 13 ); \ - TA = _mm_srli_epi32( TA, 19 ); \ + T = v128_sl32( TA, 13 ); \ + TA = v128_sr32( TA, 19 ); \ XA3 = ROL_1X32( XA3 ); \ XB3 = ROL_1X32( XB3 ); \ XA1 = XOR( XA1, T ); \ XA1 = XOR( XA1, TA ); \ - T = _mm_slli_epi32( TB, 13 ); \ - TB = _mm_srli_epi32( TB, 19 ); \ + T = v128_sl32( TB, 13 ); \ + TB = v128_sr32( TB, 19 ); \ XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ \ TA = ADD32( XA1, XA2 ); \ TB = ADD32( XB1, XB2 ); \ - T = _mm_slli_epi32( TA, 18 ); \ - TA = _mm_srli_epi32( TA, 14 ); \ + T = v128_sl32( TA, 18 ); \ + TA = v128_sr32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ XA0 = XOR( XA0, T ); \ XA0 = XOR( XA0, TA ); \ - T = _mm_slli_epi32( TB, 18 ); \ - TB = _mm_srli_epi32( TB, 14 ); \ + T = v128_sl32( TB, 18 ); \ + TB = v128_sr32( TB, 14 ); \ XA1 = ROR_1X32( XA1 ); \ XB0 = XOR( XB0, T ); \ XB0 = XOR( XB0, TB ); \ @@ -423,88 +423,88 @@ do{ \ TYPE TA = ADD32( XA0, XA3 ); \ TYPE TB = ADD32( XB0, XB3 ); \ TYPE TC = ADD32( XC0, XC3 ); \ - TYPE T = _mm_slli_epi32( TA, 7 ); \ - TA = _mm_srli_epi32( TA, 25 ); \ + TYPE T = v128_sl32( TA, 7 ); \ + TA = v128_sr32( TA, 25 ); \ XA1 = XOR( XA1, T ); \ XA1 = XOR( XA1, TA ); \ - T = _mm_slli_epi32( TB, 7 );\ - TB = _mm_srli_epi32( TB, 25 ); \ + T = v128_sl32( TB, 7 );\ + TB = v128_sr32( TB, 25 ); \ XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ - T = _mm_slli_epi32( TC, 7 );\ - TC = _mm_srli_epi32( TC, 25 );\ + T = v128_sl32( TC, 7 );\ + TC = v128_sr32( TC, 25 );\ XC1 = XOR( XC1, T ); \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA0 ); \ TB = ADD32( XB1, XB0 ); \ TC = ADD32( XC1, XC0 ); \ - T = _mm_slli_epi32( TA, 9 ); \ - TA = _mm_srli_epi32( TA, 23 ); \ + T = v128_sl32( TA, 9 ); \ + TA = v128_sr32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ - T = _mm_slli_epi32( TB, 9 );\ - TB = _mm_srli_epi32( TB, 23 );\ + T = v128_sl32( TB, 9 );\ + TB = v128_sr32( TB, 23 );\ XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ - T = _mm_slli_epi32( TC, 9 );\ - TC = _mm_srli_epi32( TC, 23 );\ + T = v128_sl32( TC, 9 );\ + TC = v128_sr32( TC, 23 );\ XC2 = XOR( XC2, T ); \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA1 ); \ TB = ADD32( XB2, XB1 ); \ TC = ADD32( XC2, XC1 ); \ - T = _mm_slli_epi32( TA, 13); \ - TA = _mm_srli_epi32( TA, 19 ); \ + T = v128_sl32( TA, 13); \ + TA = v128_sr32( TA, 19 ); \ XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ XC1 = ROL_1X32( XC1 ); \ XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - T = _mm_slli_epi32( TB, 13); \ - TB = _mm_srli_epi32( TB, 19 ); \ + T = v128_sl32( TB, 13); \ + TB = v128_sr32( TB, 19 ); \ XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ - T = _mm_slli_epi32( TC, 13); \ - TC = _mm_srli_epi32( TC, 19 ); \ + T = v128_sl32( TC, 13); \ + TC = v128_sr32( TC, 19 ); \ XC3 = XOR( XC3, T ); \ XC3 = XOR( XC3, TC ); \ \ TA = ADD32( XA3, XA2 ); \ TB = ADD32( XB3, XB2 ); \ TC = ADD32( XC3, XC2 ); \ - T = _mm_slli_epi32( TA, 18 ); \ - TA = _mm_srli_epi32( TA, 14 ); \ + T = v128_sl32( TA, 18 ); \ + TA = v128_sr32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ XC2 = SWAP_64( XC2 ); \ XA0 = XOR( XA0, T ); \ XA0 = XOR( XA0, TA ); \ - T = _mm_slli_epi32( TB, 18 ); \ - TB = _mm_srli_epi32( TB, 14 ); \ + T = v128_sl32( TB, 18 ); \ + TB = v128_sr32( TB, 14 ); \ XB0 = XOR( XB0, T ); \ XB0 = XOR( XB0, TB ); \ - T = _mm_slli_epi32( TC, 18 ); \ - TC = _mm_srli_epi32( TC, 14 ); \ + T = v128_sl32( TC, 18 ); \ + TC = v128_sr32( TC, 14 ); \ XC0 = XOR( XC0, T ); \ XC0 = XOR( XC0, TC ); \ \ TA = ADD32( XA0, XA1 ); \ TB = ADD32( 
XB0, XB1 ); \ TC = ADD32( XC0, XC1 ); \ - T = _mm_slli_epi32( TA, 7 ); \ - TA = _mm_srli_epi32( TA, 25 ); \ + T = v128_sl32( TA, 7 ); \ + TA = v128_sr32( TA, 25 ); \ XA3 = ROR_1X32( XA3 ); \ XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - T = _mm_slli_epi32( TB, 7 ); \ - TB = _mm_srli_epi32( TB, 25 ); \ + T = v128_sl32( TB, 7 ); \ + TB = v128_sr32( TB, 25 ); \ XB3 = ROR_1X32( XB3 ); \ XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ - T = _mm_slli_epi32( TC, 7 ); \ - TC = _mm_srli_epi32( TC, 25 ); \ + T = v128_sl32( TC, 7 ); \ + TC = v128_sr32( TC, 25 ); \ XC3 = ROR_1X32( XC3 ); \ XC3 = XOR( XC3, T ); \ XC3 = XOR( XC3, TC ); \ @@ -512,55 +512,55 @@ do{ \ TA = ADD32( XA3, XA0 ); \ TB = ADD32( XB3, XB0 ); \ TC = ADD32( XC3, XC0 ); \ - T = _mm_slli_epi32( TA, 9 ); \ - TA = _mm_srli_epi32( TA, 23 ); \ + T = v128_sl32( TA, 9 ); \ + TA = v128_sr32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ - T = _mm_slli_epi32( TB, 9 ); \ - TB = _mm_srli_epi32( TB, 23 ); \ + T = v128_sl32( TB, 9 ); \ + TB = v128_sr32( TB, 23 ); \ XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ - T = _mm_slli_epi32( TC, 9 ); \ - TC = _mm_srli_epi32( TC, 23 ); \ + T = v128_sl32( TC, 9 ); \ + TC = v128_sr32( TC, 23 ); \ XC2 = XOR( XC2, T ); \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA3 ); \ TB = ADD32( XB2, XB3 ); \ TC = ADD32( XC2, XC3 ); \ - T = _mm_slli_epi32( TA, 13 ); \ - TA = _mm_srli_epi32( TA, 19 ); \ + T = v128_sl32( TA, 13 ); \ + TA = v128_sr32( TA, 19 ); \ XA3 = ROL_1X32( XA3 ); \ XB3 = ROL_1X32( XB3 ); \ XC3 = ROL_1X32( XC3 ); \ XA1 = XOR( XA1, T ); \ XA1 = XOR( XA1, TA ); \ - T = _mm_slli_epi32( TB, 13 ); \ - TB = _mm_srli_epi32( TB, 19 ); \ + T = v128_sl32( TB, 13 ); \ + TB = v128_sr32( TB, 19 ); \ XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ - T = _mm_slli_epi32( TC, 13 ); \ - TC = _mm_srli_epi32( TC, 19 ); \ + T = v128_sl32( TC, 13 ); \ + TC = v128_sr32( TC, 19 ); \ XC1 = XOR( XC1, T ); \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA2 ); \ TB = ADD32( XB1, XB2 ); \ TC = ADD32( XC1, XC2 ); \ - T = _mm_slli_epi32( TA, 18 ); \ - TA = _mm_srli_epi32( TA, 14 ); \ + T = v128_sl32( TA, 18 ); \ + TA = v128_sr32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ XA0 = XOR( XA0, T ); \ XA0 = XOR( XA0, TA ); \ - T = _mm_slli_epi32( TB, 18 ); \ - TB = _mm_srli_epi32( TB, 14 ); \ + T = v128_sl32( TB, 18 ); \ + TB = v128_sr32( TB, 14 ); \ XC2 = SWAP_64( XC2 ); \ XA1 = ROR_1X32( XA1 ); \ XB0 = XOR( XB0, T ); \ XB0 = XOR( XB0, TB ); \ - T = _mm_slli_epi32( TC, 18 ); \ - TC = _mm_srli_epi32( TC, 14 ); \ + T = v128_sl32( TC, 18 ); \ + TC = v128_sr32( TC, 14 ); \ XB1 = ROR_1X32( XB1 ); \ XC1 = ROR_1X32( XC1 ); \ XC0 = XOR( XC0, T ); \ @@ -832,7 +832,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) // Working, not up to date, needs stream, shuffle optimizations. 
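/* Illustrative sketch (not part of the patch): the v128_* names substituted in
   the hunks above are assumed to be a thin portability layer over SSE2 and NEON;
   the project's real definitions live in simd-utils/simd-128.h and
   simd-utils/simd-neon.h. A minimal illustration of what such a layer could look
   like, and of why the shift pairs in the SALSA round macros amount to rotates: */
#if defined(__SSE2__)
  #include <emmintrin.h>
  typedef __m128i v128_t;
  #define v128_xor(a,b)    _mm_xor_si128( a, b )
  #define v128_add32(a,b)  _mm_add_epi32( a, b )
  #define v128_sl32(v,n)   _mm_slli_epi32( v, n )   /* shift each 32-bit lane left  */
  #define v128_sr32(v,n)   _mm_srli_epi32( v, n )   /* shift each 32-bit lane right */
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  typedef uint32x4_t v128_t;
  #define v128_xor(a,b)    veorq_u32( a, b )
  #define v128_add32(a,b)  vaddq_u32( a, b )
  #define v128_sl32(v,n)   vshlq_n_u32( v, n )
  #define v128_sr32(v,n)   vshrq_n_u32( v, n )
#endif
/* Each  T = v128_sl32(x,n); x = v128_sr32(x,32-n); dst ^= T; dst ^= x;
   sequence above is a 32-bit rotate folded into the XOR; the two shifted
   halves occupy disjoint bits, so rol32(x,n) == (x << n) | (x >> (32-n)). */
#define v128_rol32(v,n)  v128_xor( v128_sl32( v, n ), v128_sr32( v, 32-(n) ) )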
// 4x32 interleaving -static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) +static void salsa8_simd128_4way( v128_t *b, const v128_t *c ) { __m512i X0, X1, X2, X3, Y0, Y1, Y2, Y3; __m512i *B = (__m512i*)b; @@ -902,7 +902,7 @@ static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) // { l3d3, l2d3, l1d3, l0d3, l3d2, l2d2, l1d2, l0d2, // l3d1, l2d1, l1d1, l0d1, l3d0, l2d0, l1d0, l0d0 } -void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) +void scrypt_core_simd128_4way( v128_t *X, v128_t *V, const uint32_t N ) { for ( int n = 0; n < N; n++ ) { @@ -923,7 +923,7 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) for( int i = 0; i < 32; i++ ) { - X[i] = _mm_xor_si128( X[i], _mm_set_epi32( v[ x16[3] + i ].u32[3], + X[i] = v128_xor( X[i], v128_set_32( v[ x16[3] + i ].u32[3], v[ x16[2] + i ].u32[2], v[ x16[1] + i ].u32[1], v[ x16[0] + i ].u32[0] ) ); @@ -2003,28 +2003,28 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, // Scrypt 2x faster than pooler // 4x memory usage // 4x32 interleaving -static void xor_salsa8_4way( __m128i * const B, const __m128i * const C ) +static void xor_salsa8_4way( v128_t * const B, const v128_t * const C ) { - __m128i x0 = B[ 0] = _mm_xor_si128( B[ 0], C[ 0] ); - __m128i x1 = B[ 1] = _mm_xor_si128( B[ 1], C[ 1] ); - __m128i x2 = B[ 2] = _mm_xor_si128( B[ 2], C[ 2] ); - __m128i x3 = B[ 3] = _mm_xor_si128( B[ 3], C[ 3] ); - __m128i x4 = B[ 4] = _mm_xor_si128( B[ 4], C[ 4] ); - __m128i x5 = B[ 5] = _mm_xor_si128( B[ 5], C[ 5] ); - __m128i x6 = B[ 6] = _mm_xor_si128( B[ 6], C[ 6] ); - __m128i x7 = B[ 7] = _mm_xor_si128( B[ 7], C[ 7] ); - __m128i x8 = B[ 8] = _mm_xor_si128( B[ 8], C[ 8] ); - __m128i x9 = B[ 9] = _mm_xor_si128( B[ 9], C[ 9] ); - __m128i xa = B[10] = _mm_xor_si128( B[10], C[10] ); - __m128i xb = B[11] = _mm_xor_si128( B[11], C[11] ); - __m128i xc = B[12] = _mm_xor_si128( B[12], C[12] ); - __m128i xd = B[13] = _mm_xor_si128( B[13], C[13] ); - __m128i xe = B[14] = _mm_xor_si128( B[14], C[14] ); - __m128i xf = B[15] = _mm_xor_si128( B[15], C[15] ); + v128_t x0 = B[ 0] = v128_xor( B[ 0], C[ 0] ); + v128_t x1 = B[ 1] = v128_xor( B[ 1], C[ 1] ); + v128_t x2 = B[ 2] = v128_xor( B[ 2], C[ 2] ); + v128_t x3 = B[ 3] = v128_xor( B[ 3], C[ 3] ); + v128_t x4 = B[ 4] = v128_xor( B[ 4], C[ 4] ); + v128_t x5 = B[ 5] = v128_xor( B[ 5], C[ 5] ); + v128_t x6 = B[ 6] = v128_xor( B[ 6], C[ 6] ); + v128_t x7 = B[ 7] = v128_xor( B[ 7], C[ 7] ); + v128_t x8 = B[ 8] = v128_xor( B[ 8], C[ 8] ); + v128_t x9 = B[ 9] = v128_xor( B[ 9], C[ 9] ); + v128_t xa = B[10] = v128_xor( B[10], C[10] ); + v128_t xb = B[11] = v128_xor( B[11], C[11] ); + v128_t xc = B[12] = v128_xor( B[12], C[12] ); + v128_t xd = B[13] = v128_xor( B[13], C[13] ); + v128_t xe = B[14] = v128_xor( B[14], C[14] ); + v128_t xf = B[15] = v128_xor( B[15], C[15] ); - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 + #define ROL32 v128_rol32 + #define ADD32 v128_add32 + #define XOR v128_xor SALSA_8ROUNDS; @@ -2032,25 +2032,25 @@ static void xor_salsa8_4way( __m128i * const B, const __m128i * const C ) #undef ADD32 #undef XOR - B[ 0] = _mm_add_epi32( B[ 0], x0 ); - B[ 1] = _mm_add_epi32( B[ 1], x1 ); - B[ 2] = _mm_add_epi32( B[ 2], x2 ); - B[ 3] = _mm_add_epi32( B[ 3], x3 ); - B[ 4] = _mm_add_epi32( B[ 4], x4 ); - B[ 5] = _mm_add_epi32( B[ 5], x5 ); - B[ 6] = _mm_add_epi32( B[ 6], x6 ); - B[ 7] = _mm_add_epi32( B[ 7], x7 ); - B[ 8] = _mm_add_epi32( B[ 8], x8 ); - B[ 9] = _mm_add_epi32( B[ 9], x9 ); - B[10] = 
_mm_add_epi32( B[10], xa ); - B[11] = _mm_add_epi32( B[11], xb ); - B[12] = _mm_add_epi32( B[12], xc ); - B[13] = _mm_add_epi32( B[13], xd ); - B[14] = _mm_add_epi32( B[14], xe ); - B[15] = _mm_add_epi32( B[15], xf ); + B[ 0] = v128_add32( B[ 0], x0 ); + B[ 1] = v128_add32( B[ 1], x1 ); + B[ 2] = v128_add32( B[ 2], x2 ); + B[ 3] = v128_add32( B[ 3], x3 ); + B[ 4] = v128_add32( B[ 4], x4 ); + B[ 5] = v128_add32( B[ 5], x5 ); + B[ 6] = v128_add32( B[ 6], x6 ); + B[ 7] = v128_add32( B[ 7], x7 ); + B[ 8] = v128_add32( B[ 8], x8 ); + B[ 9] = v128_add32( B[ 9], x9 ); + B[10] = v128_add32( B[10], xa ); + B[11] = v128_add32( B[11], xb ); + B[12] = v128_add32( B[12], xc ); + B[13] = v128_add32( B[13], xd ); + B[14] = v128_add32( B[14], xe ); + B[15] = v128_add32( B[15], xf ); } -void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) +void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N ) { for ( int n = 0; n < N; n++ ) { @@ -2074,7 +2074,7 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) m128_ovly v; for ( int l = 0; l < 4; l++ ) v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; - X[i] = _mm_xor_si128( X[i], v.m128 ); + X[i] = v128_xor( X[i], v.m128 ); } xor_salsa8_4way( &X[ 0], &X[16] ); @@ -2095,27 +2095,27 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) // No interleaving static void salsa8_simd128( uint32_t *b, const uint32_t * const c) { - __m128i X0, X1, X2, X3; - __m128i *B = (__m128i*)b; - const __m128i *C = (const __m128i*)c; + v128_t X0, X1, X2, X3; + v128_t *B = (v128_t*)b; + const v128_t *C = (const v128_t*)c; // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 + #define ROL_1X32 v128_shufll32 + #define ROR_1X32 v128_shuflr32 + #define SWAP_64 v128_swap64 + #define ROL32 v128_rol32 + #define ADD32 v128_add32 + #define XOR v128_xor // mix C into B then shuffle B into X - B[0] = _mm_xor_si128( B[0], C[0] ); - B[1] = _mm_xor_si128( B[1], C[1] ); - B[2] = _mm_xor_si128( B[2], C[2] ); - B[3] = _mm_xor_si128( B[3], C[3] ); + B[0] = v128_xor( B[0], C[0] ); + B[1] = v128_xor( B[1], C[1] ); + B[2] = v128_xor( B[2], C[2] ); + B[3] = v128_xor( B[3], C[3] ); #if defined(__SSE4_1__) - __m128i Y0, Y1, Y2, Y3; + v128_t Y0, Y1, Y2, Y3; #if defined(__AVX2__) @@ -2188,19 +2188,19 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c) #endif // AVX2 else SSE4_1 - B[0] = _mm_add_epi32( B[0], Y0 ); - B[1] = _mm_add_epi32( B[1], Y1 ); - B[2] = _mm_add_epi32( B[2], Y2 ); - B[3] = _mm_add_epi32( B[3], Y3 ); + B[0] = v128_add32( B[0], Y0 ); + B[1] = v128_add32( B[1], Y1 ); + B[2] = v128_add32( B[2], Y2 ); + B[3] = v128_add32( B[3], Y3 ); #else // SSE2 m128_ovly y[4], z[4]; - X0 = _mm_set_epi32( b[15], b[10], b[ 5], b[ 0] ); - X1 = _mm_set_epi32( b[ 3], b[14], b[ 9], b[ 4] ); - X2 = _mm_set_epi32( b[ 7], b[ 2], b[13], b[ 8] ); - X3 = _mm_set_epi32( b[11], b[ 6], b[ 1], b[12] ); + X0 = v128_set_32( b[15], b[10], b[ 5], b[ 0] ); + X1 = v128_set_32( b[ 3], b[14], b[ 9], b[ 4] ); + X2 = v128_set_32( b[ 7], b[ 2], b[13], b[ 8] ); + X3 = v128_set_32( b[11], b[ 6], b[ 1], b[12] ); SALSA_8ROUNDS_FINAL_SIMD128; @@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c) z[3].u32[1] = y[2].u32[3]; z[3].u32[0] = y[3].u32[3]; - B[0] = _mm_add_epi32( B[0], z[0].m128 ); - B[1] = _mm_add_epi32( B[1], z[1].m128 ); - B[2] = _mm_add_epi32( 
B[2], z[2].m128 ); - B[3] = _mm_add_epi32( B[3], z[3].m128 ); + B[0] = v128_add32( B[0], z[0].m128 ); + B[1] = v128_add32( B[1], z[1].m128 ); + B[2] = v128_add32( B[2], z[2].m128 ); + B[3] = v128_add32( B[3], z[3].m128 ); #endif @@ -2257,7 +2257,7 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) for ( int n = 0; n < N; n++ ) { for ( int i = 0; i < 8; i++ ) - _mm_stream_si128( (__m128i*)V + n*8 + i, casti_m128i( X, i ) ); + _mm_stream_si128( (v128_t*)V + n*8 + i, casti_v128( X, i ) ); salsa8_simd128( &X[ 0], &X[16] ); salsa8_simd128( &X[16], &X[ 0] ); @@ -2277,15 +2277,15 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb ) { - __m128i *XA = (__m128i*)xa; - __m128i *XB = (__m128i*)xb; + v128_t *XA = (v128_t*)xa; + v128_t *XB = (v128_t*)xb; #if defined(__SSE4_1__) - __m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc ); - __m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 ); - __m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc ); - __m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 ); + v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc ); + v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 ); + v128_t t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc ); + v128_t t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 ); XA[0] = _mm_blend_epi16( t0, t2, 0xf0 ); XA[1] = _mm_blend_epi16( t1, t3, 0x3c ); XA[2] = _mm_blend_epi16( t0, t2, 0x0f ); @@ -2301,16 +2301,16 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb ) #else // SSE2 - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; - YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); - YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); - YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); - YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); - YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); - YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); - YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); - YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); + YA0 = v128_set_32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = v128_set_32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YA1 = v128_set_32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = v128_set_32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YA2 = v128_set_32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = v128_set_32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YA3 = v128_set_32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = v128_set_32( xb[11], xb[ 6], xb[ 1], xb[12] ); XA[0] = YA0; XB[0] = YB0; @@ -2327,15 +2327,15 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb ) static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb ) { - __m128i *XA = (__m128i*)xa; - __m128i *XB = (__m128i*)xb; + v128_t *XA = (v128_t*)xa; + v128_t *XB = (v128_t*)xb; #if defined(__SSE4_1__) - __m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 ); - __m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f ); - __m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c ); - __m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 ); + v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 ); + v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f ); + v128_t t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c ); + v128_t t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 ); XA[0] = _mm_blend_epi16( t0, t2, 0xcc ); XA[1] = _mm_blend_epi16( t0, t2, 0x33 ); XA[2] = _mm_blend_epi16( t1, t3, 0xcc ); @@ -2413,29 +2413,29 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb ) 
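/* Illustrative sketch (not part of the patch): the ROL_1X32 / ROR_1X32 / SWAP_64
   targets bound in this function are pure lane permutations of a 128-bit vector,
   i.e. rotate the four 32-bit lanes by one position in either direction, or swap
   the two 64-bit halves. Hypothetical definitions under that assumption; the
   name-to-direction mapping follows the project's simd-utils headers, which are
   the authoritative source. */
#if defined(__SSE2__)
  /* _mm_shuffle_epi32 picks each result lane with a 2-bit selector.
     Comments list the result lanes from high to low. */
  #define v128_shufll32(v)  _mm_shuffle_epi32( v, 0x93 )  /* { v2, v1, v0, v3 } */
  #define v128_shuflr32(v)  _mm_shuffle_epi32( v, 0x39 )  /* { v0, v3, v2, v1 } */
  #define v128_swap64(v)    _mm_shuffle_epi32( v, 0x4e )  /* { v1, v0, v3, v2 } */
#elif defined(__ARM_NEON)
  /* vextq_u32( v, v, n ) rotates the four lanes by n positions. */
  #define v128_shufll32(v)  vextq_u32( v, v, 3 )
  #define v128_shuflr32(v)  vextq_u32( v, v, 1 )
  #define v128_swap64(v)    vextq_u32( v, v, 2 )
#endif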
static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, const uint32_t * const ca, const uint32_t * const cb ) { - __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; - __m128i *BA = (__m128i*)ba; - __m128i *BB = (__m128i*)bb; - const __m128i *CA = (const __m128i*)ca; - const __m128i *CB = (const __m128i*)cb; + v128_t XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + v128_t *BA = (v128_t*)ba; + v128_t *BB = (v128_t*)bb; + const v128_t *CA = (const v128_t*)ca; + const v128_t *CB = (const v128_t*)cb; // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 - #define TYPE __m128i + #define ROL_1X32 v128_shufll32 + #define ROR_1X32 v128_shuflr32 + #define SWAP_64 v128_swap64 + #define ROL32 v128_rol32 + #define ADD32 v128_add32 + #define XOR v128_xor + #define TYPE v128_t - XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); - XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); - XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); - XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); - XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); - XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); - XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] ); - XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); + XA0 = BA[0] = v128_xor( BA[0], CA[0] ); + XB0 = BB[0] = v128_xor( BB[0], CB[0] ); + XA1 = BA[1] = v128_xor( BA[1], CA[1] ); + XB1 = BB[1] = v128_xor( BB[1], CB[1] ); + XA2 = BA[2] = v128_xor( BA[2], CA[2] ); + XB2 = BB[2] = v128_xor( BB[2], CB[2] ); + XA3 = BA[3] = v128_xor( BA[3], CA[3] ); + XB3 = BB[3] = v128_xor( BB[3], CB[3] ); #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -2447,14 +2447,14 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, #endif - BA[0] = _mm_add_epi32( BA[0], XA0 ); - BB[0] = _mm_add_epi32( BB[0], XB0 ); - BA[1] = _mm_add_epi32( BA[1], XA1 ); - BB[1] = _mm_add_epi32( BB[1], XB1 ); - BA[2] = _mm_add_epi32( BA[2], XA2 ); - BB[2] = _mm_add_epi32( BB[2], XB2 ); - BA[3] = _mm_add_epi32( BA[3], XA3 ); - BB[3] = _mm_add_epi32( BB[3], XB3 ); + BA[0] = v128_add32( BA[0], XA0 ); + BB[0] = v128_add32( BB[0], XB0 ); + BA[1] = v128_add32( BA[1], XA1 ); + BB[1] = v128_add32( BB[1], XB1 ); + BA[2] = v128_add32( BA[2], XA2 ); + BB[2] = v128_add32( BB[2], XB2 ); + BA[3] = v128_add32( BA[3], XA3 ); + BB[3] = v128_add32( BB[3], XB3 ); #undef ROL_1X32 #undef ROR_1X32 @@ -2489,8 +2489,8 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) for ( int i = 0; i < 8; i++ ) { - _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); - _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); + _mm_stream_si128( (v128_t*)V0 + n*8 + i, casti_v128( X0, i ) ); + _mm_stream_si128( (v128_t*)V1 + n*8 + i, casti_v128( X1, i ) ); } #else @@ -2535,10 +2535,10 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) const int j1 = 8 * ( X1[16] & ( N-1 ) ); for ( int i = 0; i < 8; i++ ) { - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); - casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); - casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); + const v128_t v0 = v128_load( ( (v128_t*)V0 ) +j0+i ); + const v128_t v1 = v128_load( ( (v128_t*)V1 ) +j1+i ); + casti_v128( X0, i ) = v128_xor( casti_v128( X0, i ), v0 ); + 
casti_v128( X1, i ) = v128_xor( casti_v128( X1, i ), v1 ); } #endif @@ -2555,16 +2555,16 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb, uint32_t *xc ) { - __m128i *XA = (__m128i*)xa; - __m128i *XB = (__m128i*)xb; - __m128i *XC = (__m128i*)xc; + v128_t *XA = (v128_t*)xa; + v128_t *XB = (v128_t*)xb; + v128_t *XC = (v128_t*)xc; #if defined(__SSE4_1__) - __m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc ); - __m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 ); - __m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc ); - __m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 ); + v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc ); + v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 ); + v128_t t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc ); + v128_t t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 ); XA[0] = _mm_blend_epi16( t0, t2, 0xf0 ); XA[1] = _mm_blend_epi16( t1, t3, 0x3c ); XA[2] = _mm_blend_epi16( t0, t2, 0x0f ); @@ -2588,20 +2588,20 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb, #else // SSE2 - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; - YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); - YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); - YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] ); - YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); - YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); - YC1 = _mm_set_epi32( xc[ 3], xc[14], xc[ 9], xc[ 4] ); - YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); - YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); - YC2 = _mm_set_epi32( xc[ 7], xc[ 2], xc[13], xc[ 8] ); - YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); - YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); - YC3 = _mm_set_epi32( xc[11], xc[ 6], xc[ 1], xc[12] ); + YA0 = v128_set_32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = v128_set_32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YC0 = v128_set_32( xc[15], xc[10], xc[ 5], xc[ 0] ); + YA1 = v128_set_32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = v128_set_32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YC1 = v128_set_32( xc[ 3], xc[14], xc[ 9], xc[ 4] ); + YA2 = v128_set_32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = v128_set_32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YC2 = v128_set_32( xc[ 7], xc[ 2], xc[13], xc[ 8] ); + YA3 = v128_set_32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = v128_set_32( xb[11], xb[ 6], xb[ 1], xb[12] ); + YC3 = v128_set_32( xc[11], xc[ 6], xc[ 1], xc[12] ); XA[0] = YA0; XB[0] = YB0; @@ -2622,16 +2622,16 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb, static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb, uint32_t* xc ) { - __m128i *XA = (__m128i*)xa; - __m128i *XB = (__m128i*)xb; - __m128i *XC = (__m128i*)xc; + v128_t *XA = (v128_t*)xa; + v128_t *XB = (v128_t*)xb; + v128_t *XC = (v128_t*)xc; #if defined(__SSE4_1__) - __m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 ); - __m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f ); - __m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c ); - __m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 ); + v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 ); + v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f ); + v128_t t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c ); + v128_t t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 ); XA[0] = _mm_blend_epi16( t0, t2, 0xcc ); XA[1] = _mm_blend_epi16( t0, t2, 0x33 ); XA[2] = 
_mm_blend_epi16( t1, t3, 0xcc ); @@ -2743,36 +2743,36 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb, static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) { - __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + v128_t XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, XC0, XC1, XC2, XC3; - __m128i *BA = (__m128i*)ba; - __m128i *BB = (__m128i*)bb; - __m128i *BC = (__m128i*)bc; - const __m128i *CA = (const __m128i*)ca; - const __m128i *CB = (const __m128i*)cb; - const __m128i *CC = (const __m128i*)cc; + v128_t *BA = (v128_t*)ba; + v128_t *BB = (v128_t*)bb; + v128_t *BC = (v128_t*)bc; + const v128_t *CA = (const v128_t*)ca; + const v128_t *CB = (const v128_t*)cb; + const v128_t *CC = (const v128_t*)cc; // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 - #define TYPE __m128i + #define ROL_1X32 v128_shufll32 + #define ROR_1X32 v128_shuflr32 + #define SWAP_64 v128_swap64 + #define ROL32 v128_rol32 + #define ADD32 v128_add32 + #define XOR v128_xor + #define TYPE v128_t - XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); - XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); - XC0 = BC[0] = _mm_xor_si128( BC[0], CC[0] ); - XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); - XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); - XC1 = BC[1] = _mm_xor_si128( BC[1], CC[1] ); - XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); - XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); - XC2 = BC[2] = _mm_xor_si128( BC[2], CC[2] ); - XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] ); - XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); - XC3 = BC[3] = _mm_xor_si128( BC[3], CC[3] ); + XA0 = BA[0] = v128_xor( BA[0], CA[0] ); + XB0 = BB[0] = v128_xor( BB[0], CB[0] ); + XC0 = BC[0] = v128_xor( BC[0], CC[0] ); + XA1 = BA[1] = v128_xor( BA[1], CA[1] ); + XB1 = BB[1] = v128_xor( BB[1], CB[1] ); + XC1 = BC[1] = v128_xor( BC[1], CC[1] ); + XA2 = BA[2] = v128_xor( BA[2], CA[2] ); + XB2 = BB[2] = v128_xor( BB[2], CB[2] ); + XC2 = BC[2] = v128_xor( BC[2], CC[2] ); + XA3 = BA[3] = v128_xor( BA[3], CA[3] ); + XB3 = BB[3] = v128_xor( BB[3], CB[3] ); + XC3 = BC[3] = v128_xor( BC[3], CC[3] ); #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -2784,18 +2784,18 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, #endif - BA[0] = _mm_add_epi32( BA[0], XA0 ); - BB[0] = _mm_add_epi32( BB[0], XB0 ); - BC[0] = _mm_add_epi32( BC[0], XC0 ); - BA[1] = _mm_add_epi32( BA[1], XA1 ); - BB[1] = _mm_add_epi32( BB[1], XB1 ); - BC[1] = _mm_add_epi32( BC[1], XC1 ); - BA[2] = _mm_add_epi32( BA[2], XA2 ); - BB[2] = _mm_add_epi32( BB[2], XB2 ); - BC[2] = _mm_add_epi32( BC[2], XC2 ); - BA[3] = _mm_add_epi32( BA[3], XA3 ); - BB[3] = _mm_add_epi32( BB[3], XB3 ); - BC[3] = _mm_add_epi32( BC[3], XC3 ); + BA[0] = v128_add32( BA[0], XA0 ); + BB[0] = v128_add32( BB[0], XB0 ); + BC[0] = v128_add32( BC[0], XC0 ); + BA[1] = v128_add32( BA[1], XA1 ); + BB[1] = v128_add32( BB[1], XB1 ); + BC[1] = v128_add32( BC[1], XC1 ); + BA[2] = v128_add32( BA[2], XA2 ); + BB[2] = v128_add32( BB[2], XB2 ); + BC[2] = v128_add32( BC[2], XC2 ); + BA[3] = v128_add32( BA[3], XA3 ); + BB[3] = v128_add32( BB[3], XB3 ); + BC[3] = v128_add32( BC[3], XC3 ); #undef ROL_1X32 #undef ROR_1X32 @@ -2833,9 +2833,9 @@ void scrypt_core_simd128_3buf( uint32_t *X, 
uint32_t *V, const uint32_t N ) for ( int i = 0; i < 8; i++ ) { - _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); - _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); - _mm_stream_si128( (__m128i*)V2 + n*8 + i, casti_m128i( X2, i ) ); + _mm_stream_si128( (v128_t*)V0 + n*8 + i, casti_v128( X0, i ) ); + _mm_stream_si128( (v128_t*)V1 + n*8 + i, casti_v128( X1, i ) ); + _mm_stream_si128( (v128_t*)V2 + n*8 + i, casti_v128( X2, i ) ); } #else @@ -2891,12 +2891,12 @@ void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) const int j2 = 8 * ( X2[16] & ( N-1 ) ); for ( int i = 0; i < 8; i++ ) { - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); - const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+i ); - casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); - casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); - casti_m128i( X2, i ) = _mm_xor_si128( casti_m128i( X2, i ), v2 ); + const v128_t v0 = v128_load( ( (v128_t*)V0 ) +j0+i ); + const v128_t v1 = v128_load( ( (v128_t*)V1 ) +j1+i ); + const v128_t v2 = v128_load( ( (v128_t*)V2 ) +j2+i ); + casti_v128( X0, i ) = v128_xor( casti_v128( X0, i ), v0 ); + casti_v128( X1, i ) = v128_xor( casti_v128( X1, i ), v1 ); + casti_v128( X2, i ) = v128_xor( casti_v128( X2, i ), v2 ); } #endif diff --git a/algo/scrypt/scrypt-core-4way.h b/algo/scrypt/scrypt-core-4way.h index 6567733..709ba67 100644 --- a/algo/scrypt/scrypt-core-4way.h +++ b/algo/scrypt/scrypt-core-4way.h @@ -10,7 +10,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ); // Serial SIMD over 4 way parallel -void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ); +void scrypt_core_simd128_4way( v128_t *X, v128_t *V, const uint32_t N ); // 4 way parallel over serial SIMD void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ); @@ -44,10 +44,8 @@ void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ) #endif -#if defined(__SSE2__) - // Parallel 4 way, 4x memory -void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ); +void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N ); // Linear SIMD 1 way, 1x memory, lowest void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ); @@ -61,8 +59,6 @@ void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ); // Quadruple buffered, 4x memory void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ); -#endif - // For reference only void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ); diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index b60a5ba..555774b 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -173,7 +173,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0, memcpy( pad1, key1 + 16, 16 ); memcpy( pad1 + 4, keypad, 48 ); - sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + sha256_2x_transform_le( tstate0, tstate1, pad0, pad1, tstate0, tstate1 ); memcpy( ihash0, tstate0, 32 ); @@ -186,7 +186,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0, } for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c; - sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1, + sha256_2x_transform_le( ostate0, ostate1, pad0, pad1, sha256_initial_state, sha256_initial_state ); for ( i = 0; i < 8; i++ ) @@ -196,7 +196,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const 
uint32_t *key0, } for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636; - sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + sha256_2x_transform_le( tstate0, tstate1, pad0, pad1, sha256_initial_state, sha256_initial_state ); } @@ -209,7 +209,7 @@ static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0, uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16]; int i, j; - sha256_ni2way_transform_le( istate0, istate1, salt0, salt1, + sha256_2x_transform_le( istate0, istate1, salt0, salt1, tstate0, tstate1 ); memcpy( ibuf0, salt0 + 16, 16 ); @@ -225,10 +225,10 @@ static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0, memcpy( obuf1, istate1, 32 ); ibuf0[4] = ibuf1[4] = i + 1; - sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1, - obuf0, obuf1 ); - sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1, - ostate0, ostate1 ); + sha256_2x_transform_le( obuf0, obuf1, ibuf0, ibuf1, + obuf0, obuf1 ); + sha256_2x_transform_le( ostateb0, ostateb1, obuf0, obuf1, + ostate0, ostate1 ); for ( j = 0; j < 8; j++ ) { @@ -246,20 +246,20 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0, uint32_t buf0[16], buf1[16]; int i; - sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1, - tstate0, tstate1 ); - sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16, - tstate0, tstate1 ); - sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk, - tstate0, tstate1 ); + sha256_2x_transform_be( tstate0, tstate1, salt0, salt1, + tstate0, tstate1 ); + sha256_2x_transform_be( tstate0, tstate1, salt0+16, salt1+16, + tstate0, tstate1 ); + sha256_2x_transform_le( tstate0, tstate1, finalblk, finalblk, + tstate0, tstate1 ); memcpy( buf0, tstate0, 32 ); memcpy( buf0 + 8, outerpad, 32 ); memcpy( buf1, tstate1, 32 ); memcpy( buf1 + 8, outerpad, 32 ); - sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1, - ostate0, ostate1 ); + sha256_2x_transform_le( ostate0, ostate1, buf0, buf1, + ostate0, ostate1 ); for ( i = 0; i < 8; i++ ) { @@ -272,8 +272,6 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0, #endif -#ifdef HAVE_SHA256_4WAY - static const uint32_t keypad_4way[4 * 12] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -335,14 +333,14 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { static inline void sha256_4way_init_state( void *state ) { - casti_m128i( state, 0 ) = _mm_set1_epi32( 0x6A09E667 ); - casti_m128i( state, 1 ) = _mm_set1_epi32( 0xBB67AE85 ); - casti_m128i( state, 2 ) = _mm_set1_epi32( 0x3C6EF372 ); - casti_m128i( state, 3 ) = _mm_set1_epi32( 0xA54FF53A ); - casti_m128i( state, 4 ) = _mm_set1_epi32( 0x510E527F ); - casti_m128i( state, 5 ) = _mm_set1_epi32( 0x9B05688C ); - casti_m128i( state, 6 ) = _mm_set1_epi32( 0x1F83D9AB ); - casti_m128i( state, 7 ) = _mm_set1_epi32( 0x5BE0CD19 ); + casti_v128( state, 0 ) = v128_32( 0x6A09E667 ); + casti_v128( state, 1 ) = v128_32( 0xBB67AE85 ); + casti_v128( state, 2 ) = v128_32( 0x3C6EF372 ); + casti_v128( state, 3 ) = v128_32( 0xA54FF53A ); + casti_v128( state, 4 ) = v128_32( 0x510E527F ); + casti_v128( state, 5 ) = v128_32( 0x9B05688C ); + casti_v128( state, 6 ) = v128_32( 0x1F83D9AB ); + casti_v128( state, 7 ) = v128_32( 0x5BE0CD19 ); } static inline void HMAC_SHA256_80_init_4way( const uint32_t *key, @@ -356,22 +354,22 @@ static inline void HMAC_SHA256_80_init_4way( const uint32_t *key, memcpy( pad, key + 4*16, 4*16 ); memcpy( pad + 4*4, keypad_4way, 4*48 ); - sha256_4way_transform_le( 
(__m128i*)ihash, (__m128i*)pad, - (const __m128i*)tstate ); + sha256_4way_transform_le( (v128_t*)ihash, (v128_t*)pad, + (const v128_t*)tstate ); sha256_4way_init_state( tstate ); for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c; - sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)pad, - (const __m128i*)tstate ); + sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)pad, + (const v128_t*)tstate ); for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; for ( ; i < 4*16; i++ ) pad[i] = 0x36363636; - sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)pad, - (const __m128i*)tstate ); + sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)pad, + (const v128_t*)tstate ); } static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, @@ -383,8 +381,8 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, uint32_t _ALIGN(16) obuf[4 * 16]; int i, j; - sha256_4way_transform_le( (__m128i*)istate, (__m128i*)salt, - (const __m128i*)tstate ); + sha256_4way_transform_le( (v128_t*)istate, (v128_t*)salt, + (const v128_t*)tstate ); memcpy(ibuf, salt + 4 * 16, 4 * 16); memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); @@ -397,11 +395,11 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, ibuf[4 * 4 + 2] = i + 1; ibuf[4 * 4 + 3] = i + 1; - sha256_4way_transform_le( (__m128i*)obuf, (__m128i*)ibuf, - (const __m128i*)istate ); + sha256_4way_transform_le( (v128_t*)obuf, (v128_t*)ibuf, + (const v128_t*)istate ); - sha256_4way_transform_le( (__m128i*)ostate2, (__m128i*)obuf, - (const __m128i*)ostate ); + sha256_4way_transform_le( (v128_t*)ostate2, (v128_t*)obuf, + (const v128_t*)ostate ); for ( j = 0; j < 4 * 8; j++ ) output[4 * 8 * i + j] = bswap_32( ostate2[j] ); @@ -411,38 +409,36 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate, uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { - __m128i _ALIGN(64) final[ 8*16 ]; + v128_t _ALIGN(64) final[ 8*16 ]; uint32_t _ALIGN(64) buf[4 * 16]; int i; - sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)salt, - (const __m128i*)tstate ); - sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)( salt + 4*16), - (const __m128i*)tstate ); + sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)salt, + (const v128_t*)tstate ); + sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16), + (const v128_t*)tstate ); - final[ 0] = _mm_set1_epi32( 0x00000001 ); - final[ 1] = _mm_set1_epi32( 0x80000000 ); + final[ 0] = v128_32( 0x00000001 ); + final[ 1] = v128_32( 0x80000000 ); final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] = final[ 7] = final[ 8] = final[ 9] = final[10] = final[11] = final[12] = final[13] = final[14] - = _mm_setzero_si128(); - final[15] = _mm_set1_epi32 ( 0x00000620 ); + = v128_xor( final[ 0], final[ 0] ); //_mm_setzero_si128(); + final[15] = v128_32 ( 0x00000620 ); - sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)final, - (const __m128i*)tstate ); + sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)final, + (const v128_t*)tstate ); memcpy(buf, tstate, 4 * 32); memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)buf, - (const __m128i*)ostate ); + sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)buf, + (const v128_t*)ostate ); for ( i = 0; i < 4 * 8; i++ ) output[i] = bswap_32( ostate[i] ); } -#endif /* HAVE_SHA256_4WAY */ - #ifdef HAVE_SHA256_8WAY @@ -878,9 +874,9 @@ static int 
scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, // SSE2 working intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); - scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + scrypt_core_4way( (v128_t*) W, (v128_t*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + scrypt_core_4way( (v128_t*)(W+128), (v128_t*)V, N ); dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); */ @@ -1016,13 +1012,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 ); intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 ); - scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); + scrypt_core_simd128_4way( (v128_t*)W, (v128_t*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + scrypt_core_simd128_4way( (v128_t*)(W+128), (v128_t*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N ); + scrypt_core_simd128_4way( (v128_t*)(W+256), (v128_t*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N ); + scrypt_core_simd128_4way( (v128_t*)(W+256+128), (v128_t*)V, N ); dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 ); @@ -1138,9 +1134,9 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, // SSE2 working intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); - scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + scrypt_core_4way( (v128_t*) W, (v128_t*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + scrypt_core_4way( (v128_t*)(W+128), (v128_t*)V, N ); dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); */ @@ -1339,7 +1335,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); for ( int i = 0; i < 8; i++ ) - casti_m128i( tstate, i ) = _mm_set1_epi32( midstate[i] ); + casti_v128( tstate, i ) = v128_32( midstate[i] ); HMAC_SHA256_80_init_4way(W, tstate, ostate); PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); @@ -1354,7 +1350,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); } else - scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N ); + scrypt_core_4way( (v128_t*)W, (v128_t*)scratchbuf, N ); @@ -1364,7 +1360,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, // working, simple 4 way parallel, best for scrypt -// scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); +// scrypt_core_4way( (v128_t*)W, (v128_t*)V, N ); /* // Working Linear single threaded SIMD diff --git a/algo/sha/hmac-sha256-hash-4way.c b/algo/sha/hmac-sha256-hash-4way.c index 43fa272..c039ac9 100644 --- a/algo/sha/hmac-sha256-hash-4way.c +++ b/algo/sha/hmac-sha256-hash-4way.c @@ -31,6 +31,7 @@ #include "hmac-sha256-hash-4way.h" #include "compat.h" +#if defined(__SSE2__) // HMAC 4-way SSE2 /** @@ -169,6 +170,8 @@ 
pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen, } } +#endif + #if defined(__AVX2__) // HMAC 8-way AVX2 diff --git a/algo/sha/hmac-sha256-hash-4way.h b/algo/sha/hmac-sha256-hash-4way.h index 31d51cd..c096b08 100644 --- a/algo/sha/hmac-sha256-hash-4way.h +++ b/algo/sha/hmac-sha256-hash-4way.h @@ -38,6 +38,7 @@ #include "simd-utils.h" #include "sha256-hash.h" +#if defined(__SSE2__) typedef struct _hmac_sha256_4way_context { sha256_4way_context ictx; @@ -60,6 +61,8 @@ void hmac_sha256_4way_full( void*, const void *, size_t Klen, const void *, void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t, const uint8_t *, size_t, uint64_t ); +#endif + #if defined(__AVX2__) typedef struct _hmac_sha256_8way_context @@ -78,7 +81,9 @@ void hmac_sha256_8way_full( void*, const void *, size_t Klen, const void *, void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t, const uint8_t *, size_t, uint64_t ); - + +#endif // AVX2 + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) typedef struct _hmac_sha256_16way_context @@ -100,8 +105,6 @@ void pbkdf2_sha256_16way( uint8_t *, size_t, const uint8_t *, size_t, const uint8_t *, size_t, uint64_t ); - #endif // AVX512 -#endif // AVX2 #endif // HMAC_SHA256_4WAY_H__ diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 0bf2fff..358b565 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -666,6 +666,9 @@ bool register_sha256d_algo( algo_gate_t* gate ) #elif defined(SHA256D_SHA) gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256d_sha; +#elif defined(SHA256D_NEON_SHA2) + gate->optimizations = SHA_OPT; + gate->scanhash = (void*)&scanhash_sha256d_neon_sha2; //#elif defined(SHA256D_8WAY) // gate->scanhash = (void*)&scanhash_sha256d_8way; #else diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index 103bd6a..4a7119a 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -1,6 +1,3 @@ - -#if defined(__SSE2__) - #include #include #include "sha256-hash.h" @@ -36,30 +33,29 @@ static const uint32_t K256[64] = // SHA-256 4 way SSE2 #define CHs(X, Y, Z) \ - _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) + v128_xor( v128_and( v128_xor( Y, Z ), X ), Z ) #define MAJs(X, Y, Z) \ - _mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \ - Y_xor_Z ) ) + v128_xor( Y, v128_and( X_xor_Y = v128_xor( X, Y ), Y_xor_Z ) ) #define BSG2_0(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 2), mm128_ror_32(x, 13) ), mm128_ror_32( x, 22) ) + v128_xor( v128_xor( \ + v128_ror32(x, 2), v128_ror32(x, 13) ), v128_ror32( x, 22) ) #define BSG2_1(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 6), mm128_ror_32(x, 11) ), mm128_ror_32( x, 25) ) + v128_xor( v128_xor( \ + v128_ror32(x, 6), v128_ror32(x, 11) ), v128_ror32( x, 25) ) #define SSG2_0(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 7), mm128_ror_32(x, 18) ), _mm_srli_epi32(x, 3) ) + v128_xor( v128_xor( \ + v128_ror32(x, 7), v128_ror32(x, 18) ), v128_sr32(x, 3) ) #define SSG2_1(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) + v128_xor( v128_xor( \ + v128_ror32(x, 17), v128_ror32(x, 19) ), v128_sr32(x, 10) ) #define SHA2s_MEXP( a, b, c, d ) \ - mm128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d ); + v128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d ); #define SHA256x4_MSG_EXPANSION( W ) \ W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ @@ -81,19 +77,19 @@ static const uint32_t K256[64] = #define 
SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ - __m128i T1, T2; \ - __m128i K = v128_32( K256[( (j)+(i) )] ); \ - T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \ + v128_t T1, T2; \ + v128_t K = v128_32( K256[( (j)+(i) )] ); \ + T1 = v128_add32( H, v128_add4_32( BSG2_1(E), CHs(E, F, G), \ K, W[i] ) ); \ - T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \ + T2 = v128_add32( BSG2_0(A), MAJs(A, B, C) ); \ Y_xor_Z = X_xor_Y; \ - D = _mm_add_epi32( D, T1 ); \ - H = _mm_add_epi32( T1, T2 ); \ + D = v128_add32( D, T1 ); \ + H = v128_add32( T1, T2 ); \ } while (0) #define SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ { \ - __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); \ + v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C ); \ SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ @@ -113,10 +109,10 @@ do { \ } // LE data, no need to byte swap -static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, - const __m128i *in ) +static inline void SHA256_4WAY_TRANSFORM( v128_t *out, v128_t *W, + const v128_t *in ) { - __m128i A, B, C, D, E, F, G, H; + v128_t A, B, C, D, E, F, G, H; A = in[0]; B = in[1]; @@ -135,109 +131,102 @@ static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, SHA256x4_MSG_EXPANSION( W ); SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); - out[0] = _mm_add_epi32( in[0], A ); - out[1] = _mm_add_epi32( in[1], B ); - out[2] = _mm_add_epi32( in[2], C ); - out[3] = _mm_add_epi32( in[3], D ); - out[4] = _mm_add_epi32( in[4], E ); - out[5] = _mm_add_epi32( in[5], F ); - out[6] = _mm_add_epi32( in[6], G ); - out[7] = _mm_add_epi32( in[7], H ); + out[0] = v128_add32( in[0], A ); + out[1] = v128_add32( in[1], B ); + out[2] = v128_add32( in[2], C ); + out[3] = v128_add32( in[3], D ); + out[4] = v128_add32( in[4], E ); + out[5] = v128_add32( in[5], F ); + out[6] = v128_add32( in[6], G ); + out[7] = v128_add32( in[7], H ); } // LE data, no need to byte swap -void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, - const __m128i *state_in ) +void sha256_4way_transform_le( v128_t *state_out, const v128_t *data, + const v128_t *state_in ) { - __m128i W[16]; - memcpy_128( W, data, 16 ); + v128_t W[16]; + v128_memcpy( W, data, 16 ); SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } // BE data, need to byte swap input data -void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, - const __m128i *state_in ) +void sha256_4way_transform_be( v128_t *state_out, const v128_t *data, + const v128_t *state_in ) { - __m128i W[16]; - mm128_block_bswap_32( W, data ); - mm128_block_bswap_32( W+8, data+8 ); + v128_t W[16]; + v128_block_bswap32( W, data ); + v128_block_bswap32( W+8, data+8 ); SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } // prehash_3rounds & final_rounds are not working -void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, - const __m128i *W, const __m128i *state_in ) +void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X, + const v128_t *W, const v128_t *state_in ) { - __m128i A, B, C, D, E, F, G, H; + v128_t A, B, C, D, E, F, G, H; // precalculate constant part msg expansion for second iteration. 
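// Editor's note (annotation, not part of the patch): a sketch of how this
// prehash is meant to pair with sha256_4way_final_rounds(), inferred from the
// code below and the "update precalculated msg expansion with new nonce: W[3]"
// comment in final_rounds. The second 64-byte block of a block header differs
// between nonce attempts only in the word holding the nonce lanes (W[3]), so
// rounds 0-2 and every message-expansion term that does not read W[3] can be
// computed once per job and cached in state_mid / X; final_rounds then resumes
// at round 3 and folds only the new W[3] into the cached terms. Hypothetical
// caller (names block2, midstate, nonces, hash, more_nonces are illustrative,
// and the function comment above notes this pair is not currently enabled):
//
//    v128_t mid[8], X[16];
//    sha256_4way_prehash_3rounds( mid, X, block2, midstate );  // once per job
//    do
//    {
//       block2[3] = nonces;                          // 4 interleaved nonces
//       sha256_4way_final_rounds( hash, block2, midstate, mid, X );
//    } while ( more_nonces );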
X[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); X[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); - X[ 2] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 0] ), W[11] ), - W[ 2] ); - X[ 3] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 1] ), W[12] ), - SSG2_0( W[ 4] ) ); - X[ 4] = _mm_add_epi32( _mm_add_epi32( W[13], SSG2_0( W[ 5] ) ), - W[ 4] ); - X[ 5] = _mm_add_epi32( _mm_add_epi32( W[14], SSG2_0( W[ 6] ) ), - W[ 5] ); - X [6] = _mm_add_epi32( _mm_add_epi32( W[15], SSG2_0( W[ 7] ) ), - W[ 6] ); - X[ 7] = _mm_add_epi32( _mm_add_epi32( X[ 0], SSG2_0( W[ 8] ) ), - W[ 7] ); - X[ 8] = _mm_add_epi32( _mm_add_epi32( X[ 1], SSG2_0( W[ 9] ) ), - W[ 8] ); - X[ 9] = _mm_add_epi32( SSG2_0( W[10] ), W[ 9] ); - X[10] = _mm_add_epi32( SSG2_0( W[11] ), W[10] ); - X[11] = _mm_add_epi32( SSG2_0( W[12] ), W[11] ); - X[12] = _mm_add_epi32( SSG2_0( W[13] ), W[12] ); - X[13] = _mm_add_epi32( SSG2_0( W[14] ), W[13] ); - X[14] = _mm_add_epi32( SSG2_0( W[15] ), W[14] ); - X[15] = _mm_add_epi32( SSG2_0( X[ 0] ), W[15] ); + X[ 2] = v128_add32( v128_add32( SSG2_1( X[ 0] ), W[11] ), W[ 2] ); + X[ 3] = v128_add32( v128_add32( SSG2_1( X[ 1] ), W[12] ), SSG2_0( W[ 4] ) ); + X[ 4] = v128_add32( v128_add32( W[13], SSG2_0( W[ 5] ) ), W[ 4] ); + X[ 5] = v128_add32( v128_add32( W[14], SSG2_0( W[ 6] ) ), W[ 5] ); + X [6] = v128_add32( v128_add32( W[15], SSG2_0( W[ 7] ) ), W[ 6] ); + X[ 7] = v128_add32( v128_add32( X[ 0], SSG2_0( W[ 8] ) ), W[ 7] ); + X[ 8] = v128_add32( v128_add32( X[ 1], SSG2_0( W[ 9] ) ), W[ 8] ); + X[ 9] = v128_add32( SSG2_0( W[10] ), W[ 9] ); + X[10] = v128_add32( SSG2_0( W[11] ), W[10] ); + X[11] = v128_add32( SSG2_0( W[12] ), W[11] ); + X[12] = v128_add32( SSG2_0( W[13] ), W[12] ); + X[13] = v128_add32( SSG2_0( W[14] ), W[13] ); + X[14] = v128_add32( SSG2_0( W[15] ), W[14] ); + X[15] = v128_add32( SSG2_0( X[ 0] ), W[15] ); - A = _mm_load_si128( state_in ); - B = _mm_load_si128( state_in + 1 ); - C = _mm_load_si128( state_in + 2 ); - D = _mm_load_si128( state_in + 3 ); - E = _mm_load_si128( state_in + 4 ); - F = _mm_load_si128( state_in + 5 ); - G = _mm_load_si128( state_in + 6 ); - H = _mm_load_si128( state_in + 7 ); + A = v128_load( state_in ); + B = v128_load( state_in + 1 ); + C = v128_load( state_in + 2 ); + D = v128_load( state_in + 3 ); + E = v128_load( state_in + 4 ); + F = v128_load( state_in + 5 ); + G = v128_load( state_in + 6 ); + H = v128_load( state_in + 7 ); - __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); + v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C ); SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - _mm_store_si128( state_mid , A ); - _mm_store_si128( state_mid + 1, B ); - _mm_store_si128( state_mid + 2, C ); - _mm_store_si128( state_mid + 3, D ); - _mm_store_si128( state_mid + 4, E ); - _mm_store_si128( state_mid + 5, F ); - _mm_store_si128( state_mid + 6, G ); - _mm_store_si128( state_mid + 7, H ); + v128_store( state_mid , A ); + v128_store( state_mid + 1, B ); + v128_store( state_mid + 2, C ); + v128_store( state_mid + 3, D ); + v128_store( state_mid + 4, E ); + v128_store( state_mid + 5, F ); + v128_store( state_mid + 6, G ); + v128_store( state_mid + 7, H ); } -void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const __m128i *state_mid, const __m128i *X ) +void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data, + const v128_t *state_in, const v128_t *state_mid, const v128_t *X ) { - __m128i A, B, C, D, E, F, G, H; - __m128i 
W[16]; + v128_t A, B, C, D, E, F, G, H; + v128_t W[16]; - memcpy_128( W, data, 16 ); + v128_memcpy( W, data, 16 ); - A = _mm_load_si128( state_mid ); - B = _mm_load_si128( state_mid + 1 ); - C = _mm_load_si128( state_mid + 2 ); - D = _mm_load_si128( state_mid + 3 ); - E = _mm_load_si128( state_mid + 4 ); - F = _mm_load_si128( state_mid + 5 ); - G = _mm_load_si128( state_mid + 6 ); - H = _mm_load_si128( state_mid + 7 ); + A = v128_load( state_mid ); + B = v128_load( state_mid + 1 ); + C = v128_load( state_mid + 2 ); + D = v128_load( state_mid + 3 ); + E = v128_load( state_mid + 4 ); + F = v128_load( state_mid + 5 ); + G = v128_load( state_mid + 6 ); + H = v128_load( state_mid + 7 ); - __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( G, H ); + v128_t X_xor_Y, Y_xor_Z = v128_xor( G, H ); SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); @@ -256,27 +245,20 @@ void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, // update precalculated msg expansion with new nonce: W[3]. W[ 0] = X[ 0]; W[ 1] = X[ 1]; - W[ 2] = _mm_add_epi32( X[ 2], SSG2_0( W[ 3] ) ); - W[ 3] = _mm_add_epi32( X[ 3], W[ 3] ); - W[ 4] = _mm_add_epi32( X[ 4], SSG2_1( W[ 2] ) ); - W[ 5] = _mm_add_epi32( X[ 5], SSG2_1( W[ 3] ) ); - W[ 6] = _mm_add_epi32( X[ 6], SSG2_1( W[ 4] ) ); - W[ 7] = _mm_add_epi32( X[ 7], SSG2_1( W[ 5] ) ); - W[ 8] = _mm_add_epi32( X[ 8], SSG2_1( W[ 6] ) ); - W[ 9] = _mm_add_epi32( X[ 9], _mm_add_epi32( SSG2_1( W[ 7] ), - W[ 2] ) ); - W[10] = _mm_add_epi32( X[10], _mm_add_epi32( SSG2_1( W[ 8] ), - W[ 3] ) ); - W[11] = _mm_add_epi32( X[11], _mm_add_epi32( SSG2_1( W[ 9] ), - W[ 4] ) ); - W[12] = _mm_add_epi32( X[12], _mm_add_epi32( SSG2_1( W[10] ), - W[ 5] ) ); - W[13] = _mm_add_epi32( X[13], _mm_add_epi32( SSG2_1( W[11] ), - W[ 6] ) ); - W[14] = _mm_add_epi32( X[14], _mm_add_epi32( SSG2_1( W[12] ), - W[ 7] ) ); - W[15] = _mm_add_epi32( X[15], _mm_add_epi32( SSG2_1( W[13] ), - W[ 8] ) ); + W[ 2] = v128_add32( X[ 2], SSG2_0( W[ 3] ) ); + W[ 3] = v128_add32( X[ 3], W[ 3] ); + W[ 4] = v128_add32( X[ 4], SSG2_1( W[ 2] ) ); + W[ 5] = v128_add32( X[ 5], SSG2_1( W[ 3] ) ); + W[ 6] = v128_add32( X[ 6], SSG2_1( W[ 4] ) ); + W[ 7] = v128_add32( X[ 7], SSG2_1( W[ 5] ) ); + W[ 8] = v128_add32( X[ 8], SSG2_1( W[ 6] ) ); + W[ 9] = v128_add32( X[ 9], v128_add32( SSG2_1( W[ 7] ), W[ 2] ) ); + W[10] = v128_add32( X[10], v128_add32( SSG2_1( W[ 8] ), W[ 3] ) ); + W[11] = v128_add32( X[11], v128_add32( SSG2_1( W[ 9] ), W[ 4] ) ); + W[12] = v128_add32( X[12], v128_add32( SSG2_1( W[10] ), W[ 5] ) ); + W[13] = v128_add32( X[13], v128_add32( SSG2_1( W[11] ), W[ 6] ) ); + W[14] = v128_add32( X[14], v128_add32( SSG2_1( W[12] ), W[ 7] ) ); + W[15] = v128_add32( X[15], v128_add32( SSG2_1( W[13] ), W[ 8] ) ); SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); SHA256x4_MSG_EXPANSION( W ); @@ -284,45 +266,47 @@ void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, SHA256x4_MSG_EXPANSION( W ); SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); - A = _mm_add_epi32( A, _mm_load_si128( state_in ) ); - B = _mm_add_epi32( B, _mm_load_si128( state_in + 1 ) ); - C = _mm_add_epi32( C, _mm_load_si128( state_in + 2 ) ); - D = _mm_add_epi32( D, _mm_load_si128( state_in + 3 ) ); - E = _mm_add_epi32( E, _mm_load_si128( state_in + 4 ) ); - F = _mm_add_epi32( F, _mm_load_si128( state_in + 5 ) ); - G = _mm_add_epi32( G, _mm_load_si128( state_in + 6 ) ); - H = _mm_add_epi32( H, _mm_load_si128( state_in + 7 ) ); + A = v128_add32( A, v128_load( state_in ) ); + B = v128_add32( B, v128_load( 
state_in + 1 ) ); + C = v128_add32( C, v128_load( state_in + 2 ) ); + D = v128_add32( D, v128_load( state_in + 3 ) ); + E = v128_add32( E, v128_load( state_in + 4 ) ); + F = v128_add32( F, v128_load( state_in + 5 ) ); + G = v128_add32( G, v128_load( state_in + 6 ) ); + H = v128_add32( H, v128_load( state_in + 7 ) ); - _mm_store_si128( state_out , A ); - _mm_store_si128( state_out + 1, B ); - _mm_store_si128( state_out + 2, C ); - _mm_store_si128( state_out + 3, D ); - _mm_store_si128( state_out + 4, E ); - _mm_store_si128( state_out + 5, F ); - _mm_store_si128( state_out + 6, G ); - _mm_store_si128( state_out + 7, H ); + v128_store( state_out , A ); + v128_store( state_out + 1, B ); + v128_store( state_out + 2, C ); + v128_store( state_out + 3, D ); + v128_store( state_out + 4, E ); + v128_store( state_out + 5, F ); + v128_store( state_out + 6, G ); + v128_store( state_out + 7, H ); } +# if 0 + // Working correctly but still slower -int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const uint32_t *target ) +int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data, + const v128_t *state_in, const uint32_t *target ) { - __m128i A, B, C, D, E, F, G, H, T0, T1, T2; - __m128i vmask, targ, hash; + v128_t A, B, C, D, E, F, G, H, T0, T1, T2; + v128_t vmask, targ, hash; int t6_mask, flip; - __m128i W[16]; memcpy_128( W, data, 16 ); + v128_t W[16]; memcpy_128( W, data, 16 ); - A = _mm_load_si128( state_in ); - B = _mm_load_si128( state_in+1 ); - C = _mm_load_si128( state_in+2 ); - D = _mm_load_si128( state_in+3 ); - E = _mm_load_si128( state_in+4 ); - F = _mm_load_si128( state_in+5 ); - G = _mm_load_si128( state_in+6 ); - H = _mm_load_si128( state_in+7 ); + A = v128_load( state_in ); + B = v128_load( state_in+1 ); + C = v128_load( state_in+2 ); + D = v128_load( state_in+3 ); + E = v128_load( state_in+4 ); + F = v128_load( state_in+5 ); + G = v128_load( state_in+6 ); + H = v128_load( state_in+7 ); - const __m128i IV7 = H; - const __m128i IV6 = G; + const v128_t IV7 = H; + const v128_t IV6 = G; SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); SHA256x4_MSG_EXPANSION( W ); @@ -344,7 +328,7 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] ); W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] ); - __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); + v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C ); SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); @@ -357,65 +341,64 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 ); - T0 = _mm_add_epi32( v128_32( K256[58] ), - mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) ); - B = _mm_add_epi32( B, T0 ); + T0 = v128_add32( v128_32( K256[58] ), + v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) ); + B = v128_add32( B, T0 ); - T1 = _mm_add_epi32( v128_32( K256[59] ), - mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) ); - A = _mm_add_epi32( A, T1 ); + T1 = v128_add32( v128_32( K256[59] ), + v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) ); + A = v128_add32( A, T1 ); - T2 = _mm_add_epi32( v128_32( K256[60] ), - mm128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) ); - H = _mm_add_epi32( H, T2 ); + T2 = v128_add32( v128_32( K256[60] ), + v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) ); + H = v128_add32( H, T2 ); targ = v128_32( 
target[7] ); - hash = mm128_bswap_32( _mm_add_epi32( H, IV7 ) ); + hash = v128_bswap32( v128_add32( H, IV7 ) ); - flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash ); + flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash ); - if ( likely( 0xf == ( flip ^ - mm128_movmask_32( _mm_cmpgt_epi32( hash, targ ) ) ) )) + if ( likely( + 0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) )) return 0; - t6_mask = mm128_movmask_32( vmask =_mm_cmpeq_epi32( hash, targ ) ); + t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) ); // round 58 part 2 - F = _mm_add_epi32( T0, _mm_add_epi32( BSG2_0( G ), MAJs( G, H, A ) ) ); + F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) ); // round 61 part 1 W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] ); - T0 = _mm_add_epi32( v128_32( K256[61] ), - mm128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) ); - G = _mm_add_epi32( G, T0 ); + T0 = v128_add32( v128_32( K256[61] ), + v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) ); + G = v128_add32( G, T0 ); if ( t6_mask ) { - targ = _mm_and_si128( vmask, v128_32( target[6] ) ); - hash = mm128_bswap_32( _mm_add_epi32( G, IV6 ) ); + targ = v128_and( vmask, v128_32( target[6] ) ); + hash = v128_bswap32( v128_add32( G, IV6 ) ); - if ( ( 0 != ( t6_mask & mm128_movmask_32( - _mm_cmpeq_epi32( hash, targ ) ) ) )) + if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) )) return 0; else { - flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash ); - if ( 0 != ( t6_mask & ( flip ^ mm128_movmask_32( - _mm_cmpgt_epi32( hash, targ ) ) ) ) ) + flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash ); + if ( 0 != ( t6_mask & ( flip ^ v128_movmask32( + v128_cmpgt32( hash, targ ) ) ) ) ) return 0; else if ( target[6] == 0x80000000 ) { - if ( 0 == ( t6_mask & mm128_movmask_32( - _mm_cmpgt_epi32( hash, _mm_xor_si128( hash, hash ) ) ) ) ) + if ( 0 == ( t6_mask & v128_movmask32( + v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) ) return 0; } } } // rounds 59 to 61 part 2 - E = _mm_add_epi32( T1, _mm_add_epi32( BSG2_0( F ), MAJs( F, G, H ) ) ); - D = _mm_add_epi32( T2, _mm_add_epi32( BSG2_0( E ), MAJs( E, F, G ) ) ); - C = _mm_add_epi32( T0, _mm_add_epi32( BSG2_0( D ), MAJs( D, E, F ) ) ); + E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) ); + D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) ); + C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) ); // rounds 62 & 63 W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] ); @@ -424,17 +407,18 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); - state_out[0] = _mm_add_epi32( state_in[0], A ); - state_out[1] = _mm_add_epi32( state_in[1], B ); - state_out[2] = _mm_add_epi32( state_in[2], C ); - state_out[3] = _mm_add_epi32( state_in[3], D ); - state_out[4] = _mm_add_epi32( state_in[4], E ); - state_out[5] = _mm_add_epi32( state_in[5], F ); - state_out[6] = _mm_add_epi32( state_in[6], G ); - state_out[7] = _mm_add_epi32( state_in[7], H ); + state_out[0] = v128_add32( state_in[0], A ); + state_out[1] = v128_add32( state_in[1], B ); + state_out[2] = v128_add32( state_in[2], C ); + state_out[3] = v128_add32( state_in[3], D ); + state_out[4] = v128_add32( state_in[4], E ); + state_out[5] = v128_add32( state_in[5], F ); + state_out[6] = v128_add32( state_in[6], G ); + state_out[7] = v128_add32( state_in[7], H ); return 1; } +#endif void 
sha256_4way_init( sha256_4way_context *sc ) { @@ -451,7 +435,7 @@ void sha256_4way_init( sha256_4way_context *sc ) void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) { - __m128i *vdata = (__m128i*)data; + v128_t *vdata = (v128_t*)data; size_t ptr; const int buf_size = 64; @@ -464,7 +448,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) clen = buf_size - ptr; if ( clen > len ) clen = len; - memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 ); + v128_memcpy( sc->buf + (ptr>>2), vdata, clen>>2 ); vdata = vdata + (clen>>2); ptr += clen; len -= clen; @@ -494,12 +478,12 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) if ( ptr > pad ) { - memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); + v128_memset_zero( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); sha256_4way_transform_be( sc->val, sc->buf, sc->val ); - memset_zero_128( sc->buf, pad >> 2 ); + v128_memset_zero( sc->buf, pad >> 2 ); } else - memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 ); + v128_memset_zero( sc->buf + (ptr>>2), (pad - ptr) >> 2 ); low = sc->count_low; high = (sc->count_high << 3) | (low >> 29); @@ -509,7 +493,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) ); sha256_4way_transform_be( sc->val, sc->buf, sc->val ); - mm128_block_bswap_32( dst, sc->val ); + v128_block_bswap32( dst, sc->val ); } void sha256_4way_full( void *dst, const void *data, size_t len ) @@ -1725,4 +1709,3 @@ void sha256_16way_full( void *dst, const void *data, size_t len ) #endif // AVX512 #endif // __AVX2__ -#endif // __SSE2__ diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c index f06a63e..e190f07 100644 --- a/algo/sha/sha256-hash.c +++ b/algo/sha/sha256-hash.c @@ -6,1086 +6,531 @@ static const uint32_t SHA256_IV[8] = 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; -#if defined(__SHA__) +#if defined(__x86_64__) && defined(__SHA__) + +#define sha256_opt_rounds( state_out, input, state_in ) \ +{ \ + __m128i STATE0, STATE1; \ + __m128i MSG, TMP; \ + __m128i TMSG0, TMSG1, TMSG2, TMSG3; \ + __m128i ABEF_SAVE, CDGH_SAVE; \ +\ + TMP = _mm_load_si128( (__m128i*) &state_in[0] ); \ + STATE1 = _mm_load_si128( (__m128i*) &state_in[4] ); \ +\ + TMP = _mm_shuffle_epi32( TMP, 0xB1 ); \ + STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); \ + STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 ); \ + STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); \ +\ + ABEF_SAVE = STATE0; \ + CDGH_SAVE = STATE1; \ +\ + TMSG0 = load_msg( input, 0 ); \ + TMSG1 = load_msg( input, 1 ); \ + TMSG2 = load_msg( input, 2 ); \ + TMSG3 = load_msg( input, 3 ); \ + /* Rounds 0-3 */ \ + MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, \ + 0x71374491428A2F98ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + /* Rounds 4-7 */ \ + MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, \ + 0x59F111F13956C25BULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); \ + /* Rounds 8-11 */ \ + MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x550C7DC3243185BEULL, \ + 0x12835B01D807AA98ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG 
); \ + TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); \ + /* Rounds 12-15 */ \ + MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, \ + 0x80DEB1FE72BE5D74ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); \ + TMSG0 = _mm_add_epi32( TMSG0, TMP ); \ + TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); \ + /* Rounds 16-19 */ \ + MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, \ + 0xEFBE4786E49B69C1ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); \ + TMSG1 = _mm_add_epi32( TMSG1, TMP ); \ + TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); \ + /* Rounds 20-23 */ \ + MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, \ + 0x4A7484AA2DE92C6FULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); \ + TMSG2 = _mm_add_epi32( TMSG2, TMP ); \ + TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); \ + /* Rounds 24-27 */ \ + MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xBF597FC7B00327C8ULL, \ + 0xA831C66D983E5152ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); \ + TMSG3 = _mm_add_epi32( TMSG3, TMP ); \ + TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); \ + /* Rounds 28-31 */ \ + MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x1429296706CA6351ULL, \ + 0xD5A79147C6E00BF3ULL)); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); \ + TMSG0 = _mm_add_epi32( TMSG0, TMP ); \ + TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); \ + /* Rounds 32-35 */ \ + MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x53380D134D2C6DFCULL, \ + 0x2E1B213827B70A85ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); \ + TMSG1 = _mm_add_epi32( TMSG1, TMP ); \ + TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); \ + /* Rounds 36-39 */ \ + MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x92722C8581C2C92EULL, \ + 0x766A0ABB650A7354ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); \ + TMSG2 = _mm_add_epi32( TMSG2, TMP ); \ + TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); \ + /* Rounds 40-43 */ \ + MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, \ + 0xA81A664BA2BFE8A1ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = 
_mm_alignr_epi8( TMSG2, TMSG1, 4 ); \ + TMSG3 = _mm_add_epi32( TMSG3, TMP ); \ + TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); \ + /* Rounds 44-47 */ \ + MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x106AA070F40E3585ULL, \ + 0xD6990624D192E819ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); \ + TMSG0 = _mm_add_epi32( TMSG0, TMP ); \ + TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); \ + /* Rounds 48-51 */ \ + MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x34B0BCB52748774CULL, \ + 0x1E376C0819A4C116ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); \ + TMSG1 = _mm_add_epi32( TMSG1, TMP ); \ + TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); \ + /* rounds 52-55 */ \ + MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, \ + 0x4ED8AA4A391C0CB3ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); \ + TMSG2 = _mm_add_epi32( TMSG2, TMP ); \ + TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + /* Rounds 56-59 */ \ + MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x8CC7020884C87814ULL, \ + 0x78A5636F748F82EEULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); \ + TMSG3 = _mm_add_epi32( TMSG3, TMP ); \ + TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + /* Rounds 60-63 */ \ + MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, \ + 0xA4506CEB90BEFFFAULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + MSG = _mm_shuffle_epi32(MSG, 0x0E); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ +\ + STATE0 = _mm_add_epi32( STATE0, ABEF_SAVE ); \ + STATE1 = _mm_add_epi32( STATE1, CDGH_SAVE ); \ +\ + TMP = _mm_shuffle_epi32( STATE0, 0x1B ); \ + STATE1 = _mm_shuffle_epi32( STATE1, 0xB1 ); \ + STATE0 = _mm_blend_epi16( TMP, STATE1, 0xF0 ); \ + STATE1 = _mm_alignr_epi8( STATE1, TMP, 8 ); \ +\ + _mm_store_si128( (__m128i*) &state_out[0], STATE0 ); \ + _mm_store_si128( (__m128i*) &state_out[4], STATE1 ); \ +} void sha256_opt_transform_le( uint32_t *state_out, const void *input, const uint32_t *state_in ) { - __m128i STATE0, STATE1; - __m128i MSG, TMP; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128( (__m128i*) &state_in[0] ); - STATE1 = _mm_load_si128( (__m128i*) &state_in[4] ); - - TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB - STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH - STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 ); // ABEF - STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - TMSG0 = _mm_load_si128( (const __m128i*) (input+0) ); - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, - 
0x71374491428A2F98ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 4-7 - TMSG1 = _mm_load_si128( (const __m128i*) (input+16) ); - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, - 0x59F111F13956C25BULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 8-11 - TMSG2 = _mm_load_si128( (const __m128i*) (input+32) ); - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x550C7DC3243185BEULL, - 0x12835B01D807AA98ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 12-15 - TMSG3 = _mm_load_si128( (const __m128i*) (input+48) ); - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, - 0x80DEB1FE72BE5D74ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 16-19 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, - 0xEFBE4786E49B69C1ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 20-23 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, - 0x4A7484AA2DE92C6FULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 24-27 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xBF597FC7B00327C8ULL, - 0xA831C66D983E5152ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 28-31 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x1429296706CA6351ULL, - 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 32-35 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x53380D134D2C6DFCULL, - 0x2E1B213827B70A85ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = 
_mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 36-39 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x92722C8581C2C92EULL, - 0x766A0ABB650A7354ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 40-43 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, - 0xA81A664BA2BFE8A1ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 44-47 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x106AA070F40E3585ULL, - 0xD6990624D192E819ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 48-51 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x34B0BCB52748774CULL, - 0x1E376C0819A4C116ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 52-55 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, - 0x4ED8AA4A391C0CB3ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 56-59 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x8CC7020884C87814ULL, - 0x78A5636F748F82EEULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 60-63 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, - 0xA4506CEB90BEFFFAULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Add values back to state - STATE0 = _mm_add_epi32( STATE0, ABEF_SAVE ); - STATE1 = _mm_add_epi32( STATE1, CDGH_SAVE ); - - TMP = _mm_shuffle_epi32( STATE0, 0x1B ); // FEBA - STATE1 = _mm_shuffle_epi32( STATE1, 0xB1 ); // DCHG - STATE0 = _mm_blend_epi16( TMP, STATE1, 0xF0 ); // DCBA - STATE1 = _mm_alignr_epi8( STATE1, TMP, 8 ); // ABEF - - // Save state - _mm_store_si128( (__m128i*) &state_out[0], STATE0 ); - _mm_store_si128( (__m128i*) &state_out[4], STATE1 ); +#define load_msg( m, i ) casti_v128( m, i ) + sha256_opt_rounds( 
state_out, input, state_in ); +#undef load_msg } - void sha256_opt_transform_be( uint32_t *state_out, const void *input, const uint32_t *state_in ) { - __m128i STATE0, STATE1; - __m128i MSG, TMP, BSWAP32; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128( (__m128i*) &state_in[0] ); - STATE1 = _mm_load_si128( (__m128i*) &state_in[4] ); - BSWAP32 = _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB - STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH - STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 ); // ABEF - STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - TMSG0 = _mm_load_si128( (const __m128i*) (input+0) ); - TMSG0 = _mm_shuffle_epi8( TMSG0, BSWAP32 ); - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, - 0x71374491428A2F98ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 4-7 - TMSG1 = _mm_load_si128( (const __m128i*) (input+16) ); - TMSG1 = _mm_shuffle_epi8( TMSG1, BSWAP32 ); - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, - 0x59F111F13956C25BULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 8-11 - TMSG2 = _mm_load_si128( (const __m128i*) (input+32) ); - TMSG2 = _mm_shuffle_epi8( TMSG2, BSWAP32 ); - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x550C7DC3243185BEULL, - 0x12835B01D807AA98ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 12-15 - TMSG3 = _mm_load_si128( (const __m128i*) (input+48)) ; - TMSG3 = _mm_shuffle_epi8( TMSG3, BSWAP32 ); - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, - 0x80DEB1FE72BE5D74ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 16-19 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, - 0xEFBE4786E49B69C1ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 20-23 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, - 0x4A7484AA2DE92C6FULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 24-27 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xBF597FC7B00327C8ULL, - 0xA831C66D983E5152ULL) ); - 
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 28-31 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x1429296706CA6351ULL, - 0xD5A79147C6E00BF3ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 32-35 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x53380D134D2C6DFCULL, - 0x2E1B213827B70A85ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 36-39 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x92722C8581C2C92EULL, - 0x766A0ABB650A7354ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 40-43 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, - 0xA81A664BA2BFE8A1ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 44-47 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x106AA070F40E3585ULL, - 0xD6990624D192E819ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 48-51 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x34B0BCB52748774CULL, - 0x1E376C0819A4C116ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 52-55 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, - 0x4ED8AA4A391C0CB3ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 56-59 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x8CC7020884C87814ULL, - 0x78A5636F748F82EEULL) ); - 
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 60-63 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, - 0xA4506CEB90BEFFFAULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Add values back to state - STATE0 = _mm_add_epi32( STATE0, ABEF_SAVE ); - STATE1 = _mm_add_epi32( STATE1, CDGH_SAVE ); - - TMP = _mm_shuffle_epi32( STATE0, 0x1B ); // FEBA - STATE1 = _mm_shuffle_epi32( STATE1, 0xB1 ); // DCHG - STATE0 = _mm_blend_epi16( TMP, STATE1, 0xF0 ); // DCBA - STATE1 = _mm_alignr_epi8( STATE1, TMP, 8 ); // ABEF - - // Save state - _mm_store_si128( (__m128i*) &state_out[0], STATE0 ); - _mm_store_si128( (__m128i*) &state_out[4], STATE1 ); +#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) ) + sha256_opt_rounds( state_out, input, state_in ); +#undef load_msg } // 2 way double buffered -void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, - const void *msg_X, const void *msg_Y, - const uint32_t *in_X, const uint32_t *in_Y ) -{ - __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; - __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; - __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; - __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; - __m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y; - - // Load initial values - TMP_X = _mm_load_si128( (__m128i*) &in_X[0] ); - STATE1_X = _mm_load_si128( (__m128i*) &in_X[4] ); - TMP_Y = _mm_load_si128( (__m128i*) &in_Y[0] ); - STATE1_Y = _mm_load_si128( (__m128i*) &in_Y[4] ); - - TMP_X = _mm_shuffle_epi32( TMP_X, 0xB1 ); // CDAB - TMP_Y = _mm_shuffle_epi32( TMP_Y, 0xB1 ); - STATE1_X = _mm_shuffle_epi32( STATE1_X, 0x1B ); // EFGH - STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0x1B ); - STATE0_X = _mm_alignr_epi8( TMP_X, STATE1_X, 8 ); // ABEF - STATE0_Y = _mm_alignr_epi8( TMP_Y, STATE1_Y, 8 ); - STATE1_X = _mm_blend_epi16( STATE1_X, TMP_X, 0xF0 ); // CDGH - STATE1_Y = _mm_blend_epi16( STATE1_Y, TMP_Y, 0xF0 ); - - // Save current hash - ABEF_SAVE_X = STATE0_X; - ABEF_SAVE_Y = STATE0_Y; - CDGH_SAVE_X = STATE1_X; - CDGH_SAVE_Y = STATE1_Y; - - // Rounds 0-3 - TMSG0_X = _mm_load_si128( (const __m128i*) msg_X ); - TMSG0_Y = _mm_load_si128( (const __m128i*) msg_Y ); - TMP_X = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 4-7 - TMSG1_X = _mm_load_si128( (const __m128i*) (msg_X+16) ); - TMSG1_Y = _mm_load_si128( (const __m128i*) (msg_Y+16) ); - TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = 
_mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 8-11 - TMSG2_X = _mm_load_si128( (const __m128i*) (msg_X+32) ); - TMSG2_Y = _mm_load_si128( (const __m128i*) (msg_Y+32) ); - TMP_X = _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 12-15 - TMSG3_X = _mm_load_si128( (const __m128i*) (msg_X+48) ); - TMSG3_Y = _mm_load_si128( (const __m128i*) (msg_Y+48) ); - TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 16-19 - TMP_X = _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 20-23 - TMP_X = _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( 
TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 24-27 - TMP_X = _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 28-31 - TMP_X = _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 32-35 - TMP_X = _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 36-39 - TMP_X = _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 
); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 40-43 - TMP_X = _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 44-47 - TMP_X = _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 48-51 - TMP_X = _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 52-55 - TMP_X = _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, 
STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 56-59 - TMP_X = _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 60-63 - TMP_X = _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Add values back to state - STATE0_X = _mm_add_epi32( STATE0_X, ABEF_SAVE_X ); - STATE1_X = _mm_add_epi32( STATE1_X, CDGH_SAVE_X ); - STATE0_Y = _mm_add_epi32( STATE0_Y, ABEF_SAVE_Y ); - STATE1_Y = _mm_add_epi32( STATE1_Y, CDGH_SAVE_Y ); - - TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA - TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B ); - STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG - STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 ); - STATE0_X = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0); // DCBA - STATE0_Y = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0); - STATE1_X = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF - STATE1_Y = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 ); - - // Save state - _mm_store_si128( (__m128i*) &out_X[0], STATE0_X ); - _mm_store_si128( (__m128i*) &out_X[4], STATE1_X ); - _mm_store_si128( (__m128i*) &out_Y[0], STATE0_Y ); - _mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); +#define sha256_ni2x_rounds( out_X, out_Y, msg_X, msg_Y, in_X, in_Y ) \ +{ \ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; \ + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; \ + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; \ + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; \ + __m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y; \ +\ + TMP_X = _mm_load_si128( (__m128i*) &in_X[0] ); \ + STATE1_X = _mm_load_si128( (__m128i*) &in_X[4] ); \ + TMP_Y = _mm_load_si128( (__m128i*) &in_Y[0] ); \ + STATE1_Y = _mm_load_si128( (__m128i*) &in_Y[4] ); \ +\ + TMP_X = _mm_shuffle_epi32( TMP_X, 0xB1 ); \ + TMP_Y = _mm_shuffle_epi32( TMP_Y, 0xB1 ); \ + STATE1_X = _mm_shuffle_epi32( STATE1_X, 0x1B ); 
\ + STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0x1B ); \ + STATE0_X = _mm_alignr_epi8( TMP_X, STATE1_X, 8 ); \ + STATE0_Y = _mm_alignr_epi8( TMP_Y, STATE1_Y, 8 ); \ + STATE1_X = _mm_blend_epi16( STATE1_X, TMP_X, 0xF0 ); \ + STATE1_Y = _mm_blend_epi16( STATE1_Y, TMP_Y, 0xF0 ); \ +\ + ABEF_SAVE_X = STATE0_X; \ + ABEF_SAVE_Y = STATE0_Y; \ + CDGH_SAVE_X = STATE1_X; \ + CDGH_SAVE_Y = STATE1_Y; \ +\ + TMSG0_X = load_msg( msg_X, 0 ); \ + TMSG0_Y = load_msg( msg_Y, 0 ); \ + TMSG1_X = load_msg( msg_X, 1 ); \ + TMSG1_Y = load_msg( msg_Y, 1 ); \ + TMSG2_X = load_msg( msg_X, 2 ); \ + TMSG2_Y = load_msg( msg_Y, 2 ); \ + TMSG3_X = load_msg( msg_X, 3 ); \ + TMSG3_Y = load_msg( msg_Y, 3 ); \ + /* Rounds 0-3 */ \ + TMP_X = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL ); \ + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + /* Rounds 4-7 */ \ + TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL ); \ + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); \ + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); \ + /* Rounds 8-11 */ \ + TMP_X = _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL ); \ + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); \ + TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); \ + /* Rounds 12-15 */ \ + TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL ); \ + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); \ + TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); \ + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); \ + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); \ + TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); \ + /* Rounds 16-19 */ \ + TMP_X = 
_mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL ); \ + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); \ + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); \ + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); \ + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); \ + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); \ + /* Rounds 20-23 */ \ + TMP_X = _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL ); \ + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); \ + TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); \ + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); \ + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); \ + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); \ + /* Rounds 24-27 */ \ + TMP_X = _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL ); \ + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); \ + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); \ + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); \ + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); \ + TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); \ + /* Rounds 28-31 */ \ + TMP_X = _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL ); \ + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); \ + TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); \ + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); \ + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E 
); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); \ + TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); \ + /* Rounds 32-35 */ \ + TMP_X = _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL ); \ + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); \ + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); \ + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); \ + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); \ + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); \ + /* Rounds 36-39 */ \ + TMP_X = _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL ); \ + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); \ + TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); \ + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); \ + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); \ + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); \ + /* Rounds 40-43 */ \ + TMP_X = _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL ); \ + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); \ + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); \ + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); \ + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); \ + TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); \ + /* Rounds 44-47 */ \ + TMP_X = _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL ); \ + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = 
_mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); \ + TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); \ + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); \ + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); \ + TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); \ + /* Rounds 48-51*/ \ + TMP_X = _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL ); \ + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); \ + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); \ + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); \ + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); \ + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); \ + /* Rounds 52-55 */ \ + TMP_X = _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL ); \ + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); \ + TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); \ + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); \ + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + /* Rounds 56-59 */ \ + TMP_X = _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL ); \ + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); \ + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); \ + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); \ + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + /* Rounds 60-63 */ \ + TMP_X = _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL ); \ + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( 
STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ +\ + STATE0_X = _mm_add_epi32( STATE0_X, ABEF_SAVE_X ); \ + STATE1_X = _mm_add_epi32( STATE1_X, CDGH_SAVE_X ); \ + STATE0_Y = _mm_add_epi32( STATE0_Y, ABEF_SAVE_Y ); \ + STATE1_Y = _mm_add_epi32( STATE1_Y, CDGH_SAVE_Y ); \ + TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); \ + TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B ); \ + STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); \ + STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 ); \ + STATE0_X = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0); \ + STATE0_Y = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0); \ + STATE1_X = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); \ + STATE1_Y = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 ); \ + _mm_store_si128( (__m128i*) &out_X[0], STATE0_X ); \ + _mm_store_si128( (__m128i*) &out_X[4], STATE1_X ); \ + _mm_store_si128( (__m128i*) &out_Y[0], STATE0_Y ); \ + _mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); \ } -void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, +void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ) { - __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; - __m128i MSG_X, MSG_Y, TMP_X, TMP_Y, BSWAP32; - __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; - __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; - __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; +#define load_msg( m, i ) casti_v128( m, i ) + sha256_ni2x_rounds( out_X, out_Y, msg_X, msg_Y, in_X, in_Y ); +#undef load_msg +} - // Load initial values - TMP_X = _mm_load_si128( (__m128i*) &in_X[0] ); - STATE1_X = _mm_load_si128( (__m128i*) &in_X[4] ); - TMP_Y = _mm_load_si128( (__m128i*) &in_Y[0] ); - STATE1_Y = _mm_load_si128( (__m128i*) &in_Y[4] ); - BSWAP32 = _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - - TMP_X = _mm_shuffle_epi32( TMP_X, 0xB1 ); // CDAB - TMP_Y = _mm_shuffle_epi32( TMP_Y, 0xB1 ); - STATE1_X = _mm_shuffle_epi32( STATE1_X, 0x1B ); // EFGH - STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0x1B ); - STATE0_X = _mm_alignr_epi8( TMP_X, STATE1_X, 8 ); // ABEF - STATE0_Y = _mm_alignr_epi8( TMP_Y, STATE1_Y, 8 ); - STATE1_X = _mm_blend_epi16( STATE1_X, TMP_X, 0xF0 ); // CDGH - STATE1_Y = _mm_blend_epi16( STATE1_Y, TMP_Y, 0xF0 ); - - // Save current hash - ABEF_SAVE_X = STATE0_X; - ABEF_SAVE_Y = STATE0_Y; - CDGH_SAVE_X = STATE1_X; - CDGH_SAVE_Y = STATE1_Y; - - // Rounds 0-3 - TMSG0_X = _mm_load_si128( (const __m128i*) (msg_X) ); - TMSG0_Y = _mm_load_si128( (const __m128i*) (msg_Y) ); - TMP_X = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL ); - TMSG0_X = _mm_shuffle_epi8( TMSG0_X, BSWAP32 ); - TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, BSWAP32 ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 4-7 - TMSG1_X = _mm_load_si128( (const __m128i*) (msg_X+16) ); - TMSG1_Y = _mm_load_si128( (const __m128i*) (msg_Y+16) ); - TMP_X = 
_mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL ); - TMSG1_X = _mm_shuffle_epi8( TMSG1_X, BSWAP32 ); - TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, BSWAP32 ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 8-11 - TMSG2_X = _mm_load_si128( (const __m128i*) (msg_X+32) ); - TMSG2_Y = _mm_load_si128( (const __m128i*) (msg_Y+32) ); - TMP_X = _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL ); - TMSG2_X = _mm_shuffle_epi8( TMSG2_X, BSWAP32 ); - TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, BSWAP32 ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 12-15 - TMSG3_X = _mm_load_si128( (const __m128i*) (msg_X+48) ); - TMSG3_Y = _mm_load_si128( (const __m128i*) (msg_Y+48) ); - TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL ); - TMSG3_X = _mm_shuffle_epi8( TMSG3_X, BSWAP32 ); - TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, BSWAP32 ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 16-19 - TMP_X = _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = 
_mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 20-23 - TMP_X = _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 24-27 - TMP_X = _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 28-31 - TMP_X = _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 32-35 - TMP_X = _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - 
MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 36-39 - TMP_X = _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 40-43 - TMP_X = _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 44-47 - TMP_X = _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 48-51 - TMP_X = _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = 
_mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 52-55 - TMP_X = _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 56-59 - TMP_X = _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 60-63 - TMP_X = _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Add values back to state - STATE0_X = _mm_add_epi32( STATE0_X, ABEF_SAVE_X ); - STATE1_X = _mm_add_epi32( STATE1_X, CDGH_SAVE_X ); - STATE0_Y = _mm_add_epi32( STATE0_Y, ABEF_SAVE_Y ); - STATE1_Y = _mm_add_epi32( STATE1_Y, CDGH_SAVE_Y ); - - TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA - TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B ); - STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG - STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 ); - STATE0_X = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA - STATE0_Y = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 ); - STATE1_X = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF - STATE1_Y = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 ); - - // Save state - _mm_store_si128( (__m128i*) &out_X[0], STATE0_X ); - _mm_store_si128( (__m128i*) &out_X[4], STATE1_X ); - _mm_store_si128( (__m128i*) &out_Y[0], STATE0_Y ); - _mm_store_si128( (__m128i*) &out_Y[4], 
STATE1_Y ); +void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ +#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) ) + sha256_ni2x_rounds( out_X, out_Y, msg_X, msg_Y, in_X, in_Y ); +#undef load_msg } // The next 2 functions work together to seperate the low frequency data @@ -1122,7 +567,7 @@ void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg, casti_m128i( ostate, 1 ) = STATE1; } -void sha256_ni2way_final_rounds( uint32_t *out_X, uint32_t *out_Y, +void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y, const void *msg_X, const void *msg_Y, const uint32_t *state_mid_X, const uint32_t *state_mid_Y, const uint32_t *state_save_X, const uint32_t *state_save_Y ) @@ -1414,7 +859,447 @@ void sha256_ni2way_final_rounds( uint32_t *out_X, uint32_t *out_Y, casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 ); } -#endif +#endif // SHA + +#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) + +#pragma message "NEON SHA2 for sha256" + +static const uint32_t K256[64] = +{ + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, + 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, + 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, + 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, + 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, + 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, + 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, + 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, + 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + +#define sha256_neon_rounds( state_out, input, state_in ) \ +{ \ + uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \ + uint32x4_t MSG0, MSG1, MSG2, MSG3; \ + uint32x4_t TMP0, TMP1, TMP2; \ +\ + STATE0 = vld1q_u32( state_in ); \ + STATE1 = vld1q_u32( state_in+4 ); \ + ABEF_SAVE = STATE0; \ + CDGH_SAVE = STATE1; \ +\ + MSG0 = load_msg( input, 0 ); \ + MSG1 = load_msg( input, 1 ); \ + MSG2 = load_msg( input, 2 ); \ + MSG3 = load_msg( input, 3 ); \ + TMP0 = vaddq_u32( MSG0, casti_v128( K256, 0 ) ); \ + /* Rounds 0-3 */ \ + MSG0 = vsha256su0q_u32( MSG0, MSG1 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG1, casti_v128( K256, 1 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG0 = vsha256su1q_u32( MSG0, MSG2, MSG3 ); \ + /* Rounds 4-7 */ \ + MSG1 = vsha256su0q_u32( MSG1, MSG2 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG2, casti_v128( K256, 2 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + MSG1 = vsha256su1q_u32( MSG1, MSG3, MSG0 ); \ + /* Rounds 8-11 */ \ + MSG2 = vsha256su0q_u32( MSG2, MSG3 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG3, casti_v128( K256, 3 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG2 = vsha256su1q_u32( MSG2, MSG0, MSG1 ); \ + /* Rounds 12-15 */ \ + MSG3 = vsha256su0q_u32( MSG3, MSG0 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG0, casti_v128( K256, 4 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + 
MSG3 = vsha256su1q_u32( MSG3, MSG1, MSG2 ); \ + /* Rounds 16-19 */ \ + MSG0 = vsha256su0q_u32( MSG0, MSG1 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG1, casti_v128( K256, 5 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG0 = vsha256su1q_u32( MSG0, MSG2, MSG3 ); \ + /* Rounds 20-23 */ \ + MSG1 = vsha256su0q_u32( MSG1, MSG2 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG2, casti_v128( K256, 6 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + MSG1 = vsha256su1q_u32( MSG1, MSG3, MSG0 ); \ + /* Rounds 24-27 */ \ + MSG2 = vsha256su0q_u32( MSG2, MSG3 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG3, casti_v128( K256, 7 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG2 = vsha256su1q_u32( MSG2, MSG0, MSG1 ); \ + /* Rounds 28-31 */ \ + MSG3 = vsha256su0q_u32( MSG3, MSG0 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG0, casti_v128( K256, 8 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + MSG3 = vsha256su1q_u32( MSG3, MSG1, MSG2 ); \ + /* Rounds 32-35 */ \ + MSG0 = vsha256su0q_u32( MSG0, MSG1 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG1, casti_v128( K256, 9 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG0 = vsha256su1q_u32( MSG0, MSG2, MSG3 ); \ + /* Rounds 36-39 */ \ + MSG1 = vsha256su0q_u32( MSG1, MSG2 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG2, casti_v128( K256, 10 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + MSG1 = vsha256su1q_u32( MSG1, MSG3, MSG0 ); \ + /* Rounds 40-43 */ \ + MSG2 = vsha256su0q_u32( MSG2, MSG3 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG3, casti_v128( K256, 11 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG2 = vsha256su1q_u32( MSG2, MSG0, MSG1 ); \ + /* Rounds 44-47 */ \ + MSG3 = vsha256su0q_u32( MSG3, MSG0 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG0, casti_v128( K256, 12 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + MSG3 = vsha256su1q_u32( MSG3, MSG1, MSG2 ); \ + /* Rounds 48-51 */ \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG1, casti_v128( K256, 13 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + /* Rounds 52-55 */ \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG2, casti_v128( K256, 14 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + /* Rounds 56-59 */ \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG3, casti_v128( K256, 15 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + /* Rounds 60-63 */ \ + TMP2 = STATE0; \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \ + STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \ + vst1q_u32( state_out , STATE0 ); \ + vst1q_u32( state_out+4, STATE1 ); \ +} + +void sha256_neon_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ) +{ +#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) ); + sha256_neon_rounds( state_out, input, state_in ); +#undef load_msg +} + +void sha256_neon_transform_le( uint32_t 
*state_out, const void *input, + const uint32_t *state_in ) +{ +#define load_msg( m, i ) casti_v128( m, i ); + sha256_neon_rounds( state_out, input, state_in ); +#undef load_msg +} + +#define sha256_neon2x_rounds( state_out_X, state_out_Y, input_X, \ + input_Y, state_in_X, state_in_Y ) \ +{ \ + uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \ + uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \ + uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \ + uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \ + uint32x4_t TMP0_X, TMP1_X, TMP2_X; \ + uint32x4_t TMP0_Y, TMP1_Y, TMP2_Y; \ +\ + STATE0_X = vld1q_u32( state_in_X ); \ + STATE0_Y = vld1q_u32( state_in_Y ); \ + STATE1_X = vld1q_u32( state_in_X+4 ); \ + STATE1_Y = vld1q_u32( state_in_Y+4 ); \ + ABEF_SAVE_X = STATE0_X; \ + ABEF_SAVE_Y = STATE0_Y; \ + CDGH_SAVE_X = STATE1_X; \ + CDGH_SAVE_Y = STATE1_Y; \ +\ + MSG0_X = load_msg( input_X, 0 ); \ + MSG0_Y = load_msg( input_Y, 0 ); \ + MSG1_X = load_msg( input_X, 1 ); \ + MSG1_Y = load_msg( input_Y, 1 ); \ + MSG2_X = load_msg( input_X, 2 ); \ + MSG2_Y = load_msg( input_Y, 2 ); \ + MSG3_X = load_msg( input_X, 3 ); \ + MSG3_Y = load_msg( input_Y, 3 ); \ + TMP0_X = vaddq_u32( MSG0_X, casti_v128( K256, 0 ) ); \ + TMP0_Y = vaddq_u32( MSG0_Y, casti_v128( K256, 0 ) ); \ + /* Rounds 0-3 */ \ + MSG0_X = vsha256su0q_u32( MSG0_X, MSG1_X ); \ + MSG0_Y = vsha256su0q_u32( MSG0_Y, MSG1_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG1_X, casti_v128( K256, 1 ) ); \ + TMP1_Y = vaddq_u32( MSG1_Y, casti_v128( K256, 1 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG0_X = vsha256su1q_u32( MSG0_X, MSG2_X, MSG3_X ); \ + MSG0_Y = vsha256su1q_u32( MSG0_Y, MSG2_Y, MSG3_Y ); \ + /* Rounds 4-7 */ \ + MSG1_X = vsha256su0q_u32( MSG1_X, MSG2_X ); \ + MSG1_Y = vsha256su0q_u32( MSG1_Y, MSG2_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG2_X, casti_v128( K256, 2 ) ); \ + TMP0_Y = vaddq_u32( MSG2_Y, casti_v128( K256, 2 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG1_X = vsha256su1q_u32( MSG1_X, MSG3_X, MSG0_X ); \ + MSG1_Y = vsha256su1q_u32( MSG1_Y, MSG3_Y, MSG0_Y ); \ + /* Rounds 8-11 */ \ + MSG2_X = vsha256su0q_u32( MSG2_X, MSG3_X ); \ + MSG2_Y = vsha256su0q_u32( MSG2_Y, MSG3_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG3_X, casti_v128( K256, 3 ) ); \ + TMP1_Y = vaddq_u32( MSG3_Y, casti_v128( K256, 3 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG2_X = vsha256su1q_u32( MSG2_X, MSG0_X, MSG1_X ); \ + MSG2_Y = vsha256su1q_u32( MSG2_Y, MSG0_Y, MSG1_Y ); \ + /* Rounds 12-15 */ \ + MSG3_X = vsha256su0q_u32( MSG3_X, MSG0_X ); \ + MSG3_Y = vsha256su0q_u32( MSG3_Y, MSG0_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG0_X, casti_v128( K256, 4 ) ); \ + TMP0_Y = vaddq_u32( MSG0_Y, casti_v128( K256, 4 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, 
TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG3_X = vsha256su1q_u32( MSG3_X, MSG1_X, MSG2_X ); \ + MSG3_Y = vsha256su1q_u32( MSG3_Y, MSG1_Y, MSG2_Y ); \ + /* Rounds 16-19 */ \ + MSG0_X = vsha256su0q_u32( MSG0_X, MSG1_X ); \ + MSG0_Y = vsha256su0q_u32( MSG0_Y, MSG1_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG1_X, casti_v128( K256, 5 ) ); \ + TMP1_Y = vaddq_u32( MSG1_Y, casti_v128( K256, 5 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG0_X = vsha256su1q_u32( MSG0_X, MSG2_X, MSG3_X ); \ + MSG0_Y = vsha256su1q_u32( MSG0_Y, MSG2_Y, MSG3_Y ); \ + /* Rounds 20-23 */ \ + MSG1_X = vsha256su0q_u32( MSG1_X, MSG2_X ); \ + MSG1_Y = vsha256su0q_u32( MSG1_Y, MSG2_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG2_X, casti_v128( K256, 6 ) ); \ + TMP0_Y = vaddq_u32( MSG2_Y, casti_v128( K256, 6 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG1_X = vsha256su1q_u32( MSG1_X, MSG3_X, MSG0_X ); \ + MSG1_Y = vsha256su1q_u32( MSG1_Y, MSG3_Y, MSG0_Y ); \ + /* Rounds 24-27 */ \ + MSG2_X = vsha256su0q_u32( MSG2_X, MSG3_X ); \ + MSG2_Y = vsha256su0q_u32( MSG2_Y, MSG3_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG3_X, casti_v128( K256, 7 ) ); \ + TMP1_Y = vaddq_u32( MSG3_Y, casti_v128( K256, 7 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG2_X = vsha256su1q_u32( MSG2_X, MSG0_X, MSG1_X ); \ + MSG2_Y = vsha256su1q_u32( MSG2_Y, MSG0_Y, MSG1_Y ); \ + /* Rounds 28-31 */ \ + MSG3_X = vsha256su0q_u32( MSG3_X, MSG0_X ); \ + MSG3_Y = vsha256su0q_u32( MSG3_Y, MSG0_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG0_X, casti_v128( K256, 8 ) ); \ + TMP0_Y = vaddq_u32( MSG0_Y, casti_v128( K256, 8 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG3_X = vsha256su1q_u32( MSG3_X, MSG1_X, MSG2_X ); \ + MSG3_Y = vsha256su1q_u32( MSG3_Y, MSG1_Y, MSG2_Y ); \ + /* Rounds 32-35 */ \ + MSG0_X = vsha256su0q_u32( MSG0_X, MSG1_X ); \ + MSG0_Y = vsha256su0q_u32( MSG0_Y, MSG1_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG1_X, casti_v128( K256, 9 ) ); \ + TMP1_Y = vaddq_u32( MSG1_Y, casti_v128( K256, 9 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG0_X = vsha256su1q_u32( MSG0_X, MSG2_X, MSG3_X ); \ + MSG0_Y = vsha256su1q_u32( MSG0_Y, MSG2_Y, MSG3_Y ); \ + /* Rounds 36-39 */ \ + MSG1_X = vsha256su0q_u32( MSG1_X, MSG2_X ); \ + MSG1_Y = vsha256su0q_u32( MSG1_Y, MSG2_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + 
TMP0_X = vaddq_u32( MSG2_X, casti_v128( K256, 10 ) ); \ + TMP0_Y = vaddq_u32( MSG2_Y, casti_v128( K256, 10 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG1_X = vsha256su1q_u32( MSG1_X, MSG3_X, MSG0_X ); \ + MSG1_Y = vsha256su1q_u32( MSG1_Y, MSG3_Y, MSG0_Y ); \ + /* Rounds 40-43 */ \ + MSG2_X = vsha256su0q_u32( MSG2_X, MSG3_X ); \ + MSG2_Y = vsha256su0q_u32( MSG2_Y, MSG3_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG3_X, casti_v128( K256, 11 ) ); \ + TMP1_Y = vaddq_u32( MSG3_Y, casti_v128( K256, 11 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG2_X = vsha256su1q_u32( MSG2_X, MSG0_X, MSG1_X ); \ + MSG2_Y = vsha256su1q_u32( MSG2_Y, MSG0_Y, MSG1_Y ); \ + /* Rounds 44-47 */ \ + MSG3_X = vsha256su0q_u32( MSG3_X, MSG0_X ); \ + MSG3_Y = vsha256su0q_u32( MSG3_Y, MSG0_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG0_X, casti_v128( K256, 12 ) ); \ + TMP0_Y = vaddq_u32( MSG0_Y, casti_v128( K256, 12 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG3_X = vsha256su1q_u32( MSG3_X, MSG1_X, MSG2_X ); \ + MSG3_Y = vsha256su1q_u32( MSG3_Y, MSG1_Y, MSG2_Y ); \ + /* Rounds 48-51 */ \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG1_X, casti_v128( K256, 13 ) ); \ + TMP1_Y = vaddq_u32( MSG1_Y, casti_v128( K256, 13 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + /* Rounds 52-55 */ \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG2_X, casti_v128( K256, 14 ) ); \ + TMP0_Y = vaddq_u32( MSG2_Y, casti_v128( K256, 14 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + /* Rounds 56-59 */ \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG3_X, casti_v128( K256, 15 ) ); \ + TMP1_Y = vaddq_u32( MSG3_Y, casti_v128( K256, 15 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + /* Rounds 60-63 */ \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \ + STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \ + STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \ + STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \ + vst1q_u32( state_out_X , STATE0_X ); \ +
vst1q_u32( state_out_Y , STATE0_Y ); \ + vst1q_u32( state_out_X+4, STATE1_X ); \ + vst1q_u32( state_out_Y+4, STATE1_Y ); \ +} + +void sha256_neon2x_transform_le( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ +#define load_msg( m, i ) casti_v128( m, i ) + sha256_neon2x_rounds( out_X, out_Y, msg_X, msg_Y, in_X, in_Y ); +#undef load_msg +} + +void sha256_neon2x_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ +#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) ) + sha256_neon2x_rounds( out_X, out_Y, msg_X, msg_Y, in_X, in_Y ); +#undef load_msg +} + +//TODO finish prehash for ARM + +void sha256_neon_prehash_3rounds( uint32_t *ostate, const void *msg, + uint32_t *sstate, const uint32_t *istate ) +{ + uint32x4_t STATE0, STATE1, MSG0, MSG1, TMP0, TMP1; + + STATE0 = casti_v128( istate, 0 ); + STATE1 = casti_v128( istate, 1 ); + + // Save current hash + casti_v128( sstate, 0 ) = STATE0; + casti_v128( sstate, 1 ) = STATE1; + + MSG0 = casti_v128( msg, 0 ); + MSG1 = casti_v128( msg, 1 ); + TMP0 = vaddq_u32( MSG0, casti_v128( K256, 0 ) ); + + /* Rounds 0-3 */ \ + MSG0 = vsha256su0q_u32( MSG0, MSG1 ); + TMP1 = STATE0; + casti_v128( ostate, 0 ) = vsha256hq_u32( STATE0, STATE1, TMP0 ); + casti_v128( ostate, 1 ) = vsha256h2q_u32( STATE1, TMP1, TMP0 ); +} + + +#endif // arm void sha256_ctx_init( sha256_context *ctx ) diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h index 763b405..f516922 100644 --- a/algo/sha/sha256-hash.h +++ b/algo/sha/sha256-hash.h @@ -25,7 +25,7 @@ void sha256_transform_le( uint32_t *state_out, const uint32_t *data, void sha256_transform_be( uint32_t *state_out, const uint32_t *data, const uint32_t *state_in ); -#if defined(__SHA__) +#if defined(__x86_64__) && defined(__SHA__) void sha256_opt_transform_le( uint32_t *state_out, const void *input, const uint32_t *state_in ); @@ -33,34 +33,67 @@ void sha256_opt_transform_le( uint32_t *state_out, const void *input, void sha256_opt_transform_be( uint32_t *state_out, const void *input, const uint32_t *state_in ); -// 2 way with interleaved instructions -void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, +// 2 way serial with interleaved instructions +void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ); -void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, +void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ); void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg, uint32_t *sstate, const uint32_t *istate ); -void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y, +void sha256_ni2x_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y, const void *msg_X, const void *msg_Y, const uint32_t *state_mid_X, const uint32_t *state_mid_Y, const uint32_t *state_save_X, const uint32_t *state_save_Y ); -// Select target -// with SHA... 
-#define sha256_transform_le sha256_opt_transform_le -#define sha256_transform_be sha256_opt_transform_be +#define sha256_transform_le sha256_opt_transform_le +#define sha256_transform_be sha256_opt_transform_be +#define sha256_2x_transform_le sha256_ni2x_transform_le +#define sha256_2x_transform_be sha256_ni2x_transform_be +#define sha256_prehash_3rounds sha256_ni_prehash_3rounds +#define sha256_2x_final_rounds sha256_ni2x_final_rounds + +#elif defined(__aarch64__) && defined(__ARM_NEON) + +void sha256_neon_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ); +void sha256_neon_transform_le( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +void sha256_neon2x_transform_le( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +void sha256_neon2x_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +void sha256_neon_prehash_3rounds( uint32_t *ostate, const void *msg, + uint32_t *sstate, const uint32_t *istate ); + +void sha256_neon2x_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *state_mid_X, const uint32_t *state_mid_Y, + const uint32_t *state_save_X, const uint32_t *state_save_Y ); + +#define sha256_transform_le sha256_neon_transform_le +#define sha256_transform_be sha256_neon_transform_be +#define sha256_2x_transform_le sha256_neon2x_transform_le +#define sha256_2x_transform_be sha256_neon2x_transform_be +#define sha256_prehash_3rounds sha256_neon_prehash_3rounds +#define sha256_2x_final_rounds sha256_neon2x_final_rounds #else -// without SHA... +// without HW acceleration... 
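/* Illustrative sketch, not part of this patch: whichever branch above is
   selected (x86-64 SHA-NI, AArch64 NEON with SHA2, or this generic fallback),
   callers are expected to use only the generic names mapped by these macros.
   The buffer names below are hypothetical; sha256_iv is the initial state used
   elsewhere in this patch. The 2x variants are only mapped in the accelerated
   branches above.

      uint32_t mid[8];
      // one 64-byte little-endian block: state_out, data, state_in
      sha256_transform_le( mid, data64, sha256_iv );
*/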
#include "sph_sha2.h" -#define sha256_transform_le sph_sha256_transform_le -#define sha256_transform_be sph_sha256_transform_be +#define sha256_transform_le sph_sha256_transform_le +#define sha256_transform_be sph_sha256_transform_be +#define sha256_prehash_3rounds sph_sha256_prehash_3rounds #endif @@ -122,14 +155,12 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, #endif // AVX2 -#if defined(__SSE2__) - // SHA-256 4 way typedef struct { - __m128i buf[64>>2]; - __m128i val[8]; + v128_t buf[64>>2]; + v128_t val[8]; uint32_t count_high, count_low; } sha256_4way_context __attribute__ ((aligned (32))); @@ -138,17 +169,16 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); void sha256_4way_full( void *dst, const void *data, size_t len ); -void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, - const __m128i *state_in ); -void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, - const __m128i *state_in ); -void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, - const __m128i *W, const __m128i *state_in ); -void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const __m128i *state_mid, const __m128i *X ); -int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const uint32_t *target ); +void sha256_4way_transform_le( v128_t *state_out, const v128_t *data, + const v128_t *state_in ); +void sha256_4way_transform_be( v128_t *state_out, const v128_t *data, + const v128_t *state_in ); +void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X, + const v128_t *W, const v128_t *state_in ); +void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data, + const v128_t *state_in, const v128_t *state_mid, const v128_t *X ); +int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data, + const v128_t *state_in, const uint32_t *target ); -#endif // SSE2 #endif diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index b983515..a028f94 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -32,11 +32,11 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id = mythr->id; const bool bench = opt_benchmark; - const __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + const v128_t shuf_bswap32 = + v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); // hash first 64 byte block of data - sha256_opt_transform_le( mstatea, pdata, sha256_iv ); + sha256_transform_le( mstatea, pdata, sha256_iv ); // fill & pad second bock without nonce memcpy( block1a, pdata + 16, 12 ); @@ -48,7 +48,7 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, memset( block1b + 5, 0, 40 ); block1a[15] = block1b[15] = 80*8; // bit count - sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea); + sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea); // Pad third block block2a[ 8] = block2b[ 8] = 0x80000000; @@ -61,18 +61,18 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, // Insert nonce for second block block1a[3] = n; block1b[3] = n+1; - sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + sha256_2x_final_rounds( block2a, block2b, block1a, block1b, mstateb, mstateb, sstate, sstate ); - sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, + sha256_2x_transform_le( 
hasha, hashb, block2a, block2b, sha256_iv, sha256_iv ); if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) { - casti_m128i( hasha, 0 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); - casti_m128i( hasha, 1 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + casti_v128( hasha, 0 ) = + _mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 ); + casti_v128( hasha, 1 ) = + _mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 ); if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) { pdata[19] = n; @@ -81,10 +81,94 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, } if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) { - casti_m128i( hashb, 0 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); - casti_m128i( hashb, 1 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + casti_v128( hashb, 0 ) = + _mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 ); + casti_v128( hashb, 1 ) = + _mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hashb, mythr ); + } + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +#if defined(SHA256D_NEON_SHA2) + +int scanhash_sha256d_neon_sha2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const v128_t shuf_bswap32 = + v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // hash first 64 byte block of data + sha256_transform_le( mstatea, pdata, sha256_iv ); + + // fill & pad second bock without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = 0; + block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 80*8; // bit count + + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 32*8; // bit count + + do + { + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_neon2x_transform_le( block2a, block2b, block1a, block1b, + mstatea, mstatea ); + + sha256_neon2x_transform_le( hasha, hashb, block2a, block2b, + sha256_iv, sha256_iv ); + + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) + { + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hasha, mythr ); + } + } + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) + { + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + 
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) { pdata[19] = n+1; @@ -282,11 +366,11 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i vdata[32] __attribute__ ((aligned (64))); - __m128i block[16] __attribute__ ((aligned (32))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i istate[8] __attribute__ ((aligned (32))); - __m128i mstate[8] __attribute__ ((aligned (32))); + v128_t vdata[32] __attribute__ ((aligned (64))); + v128_t block[16] __attribute__ ((aligned (32))); + v128_t hash32[8] __attribute__ ((aligned (32))); + v128_t istate[8] __attribute__ ((aligned (32))); + v128_t mstate[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; @@ -295,23 +379,23 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m128i *noncev = vdata + 19; + v128_t *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; - const __m128i last_byte = v128_32( 0x80000000 ); - const __m128i four = v128_32( 4 ); + const v128_t last_byte = v128_32( 0x80000000 ); + const v128_t four = v128_32( 4 ); for ( int i = 0; i < 19; i++ ) vdata[i] = v128_32( pdata[i] ); - *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + *noncev = v128_set_32( n+ 3, n+ 2, n+1, n ); vdata[16+4] = last_byte; - memset_zero_128( vdata+16 + 5, 10 ); + v128_memset_zero( vdata+16 + 5, 10 ); vdata[16+15] = v128_32( 80*8 ); block[ 8] = last_byte; - memset_zero_128( block + 9, 6 ); + v128_memset_zero( block + 9, 6 ); block[15] = v128_32( 32*8 ); // initialize state @@ -332,7 +416,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, sha256_4way_transform_le( block, vdata+16, mstate ); sha256_4way_transform_le( hash32, block, istate ); - mm128_block_bswap_32( hash32, hash32 ); + v128_block_bswap32( hash32, hash32 ); for ( int lane = 0; lane < 4; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) @@ -344,7 +428,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, submit_solution( work, lane_hash, mythr ); } } - *noncev = _mm_add_epi32( *noncev, four ); + *noncev = v128_add32( *noncev, four ); n += 4; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; diff --git a/algo/sha/sha256d-4way.h b/algo/sha/sha256d-4way.h index 85e5267..ce459e9 100644 --- a/algo/sha/sha256d-4way.h +++ b/algo/sha/sha256d-4way.h @@ -8,6 +8,8 @@ #define SHA256D_16WAY 1 #elif defined(__SHA__) #define SHA256D_SHA 1 +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) + #define SHA256D_NEON_SHA2 1 #elif defined(__AVX2__) #define SHA256D_8WAY 1 #else @@ -41,5 +43,12 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, #endif +#if defined(SHA256D_NEON_SHA2) + +int scanhash_sha256d_neon_sha2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + #endif diff --git a/algo/sha/sha256dt.c b/algo/sha/sha256dt.c index 588425d..d12f796 100644 --- a/algo/sha/sha256dt.c +++ b/algo/sha/sha256dt.c @@ -9,6 +9,8 @@ #define SHA256DT_16WAY 1 #elif defined(__SHA__) #define SHA256DT_SHA 1 +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) + 
#define SHA256DT_NEON_SHA2 1 #elif defined(__AVX2__) #define SHA256DT_8WAY 1 #else @@ -42,11 +44,11 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id = mythr->id; const bool bench = opt_benchmark; - const __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + const v128_t shuf_bswap32 = + v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); // hash first 64 byte block of data - sha256_opt_transform_le( mstatea, pdata, sha256dt_iv ); + sha256_transform_le( mstatea, pdata, sha256dt_iv ); // fill & pad second bock without nonce memcpy( block1a, pdata + 16, 12 ); @@ -57,7 +59,7 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce, memset( block1b + 5, 0, 40 ); block1a[15] = block1b[15] = 0x480; // funky bit count - sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea); + sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea); // Pad third block block2a[ 8] = block2b[ 8] = 0x80000000; @@ -70,18 +72,16 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce, // Insert nonce for second block block1a[3] = n; block1b[3] = n+1; - sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + sha256_2x_final_rounds( block2a, block2b, block1a, block1b, mstateb, mstateb, sstate, sstate ); - sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, + sha256_2x_transform_le( hasha, hashb, block2a, block2b, sha256dt_iv, sha256dt_iv ); if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) { - casti_m128i( hasha, 0 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); - casti_m128i( hasha, 1 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) { pdata[19] = n; @@ -90,10 +90,92 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce, } if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) { - casti_m128i( hashb, 0 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); - casti_m128i( hashb, 1 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hashb, mythr ); + } + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +#if defined(SHA256DT_NEON_SHA2) + +#pragma message "SHA256DT MEON SHA" + +int scanhash_sha256dt_neon_sha2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; 
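// The length words used below are nonstandard for sha256dt: 0x480 (1152 bits)
// for the 80-byte first message and 0x300 (768 bits) for the 32-byte second
// message, where standard SHA-256 padding would use 640 and 256 bits (hence
// the "funky bit count" comment). They must match the constants in the x86-64
// SHA path above so both backends produce identical hashes. Each loop
// iteration hashes two nonces, n and n+1, through the interleaved 2-lane
// NEON transform.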
+ const v128_t shuf_bswap32 = + v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // hash first 64 byte block of data + sha256_neon_transform_le( mstatea, pdata, sha256dt_iv ); + + // fill & pad second bock without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 0x480; // funky bit count + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 0x300; // bit count + + do + { + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_neon2x_transform_le( block2a, block2b, block1a, block1b, + mstatea, mstatea ); + + sha256_neon2x_transform_le( hasha, hashb, block2a, block2b, + sha256dt_iv, sha256dt_iv ); + + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) + { + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hasha, mythr ); + } + } + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) + { + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) { pdata[19] = n+1; @@ -132,7 +214,7 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, const int thr_id = mythr->id; const __m512i sixteen = v512_32( 16 ); const bool bench = opt_benchmark; - const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x( + const __m256i bswap_shuf = mm256_bcast_m128( v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); // prehash first block directly from pdata @@ -227,7 +309,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, const bool bench = opt_benchmark; const __m256i last_byte = v256_32( 0x80000000 ); const __m256i eight = v256_32( 8 ); - const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x( + const __m256i bswap_shuf = mm256_bcast_m128( v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); for ( int i = 0; i < 19; i++ ) @@ -291,11 +373,11 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i vdata[32] __attribute__ ((aligned (64))); - __m128i block[16] __attribute__ ((aligned (32))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i initstate[8] __attribute__ ((aligned (32))); - __m128i midstate[8] __attribute__ ((aligned (32))); + v128_t vdata[32] __attribute__ ((aligned (64))); + v128_t block[16] __attribute__ ((aligned (32))); + v128_t hash32[8] __attribute__ ((aligned (32))); + v128_t initstate[8] __attribute__ ((aligned (32))); + v128_t midstate[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; @@ -304,23 +386,23 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m128i *noncev = vdata + 19; + v128_t *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = 
opt_benchmark; - const __m128i last_byte = v128_32( 0x80000000 ); - const __m128i four = v128_32( 4 ); + const v128_t last_byte = v128_32( 0x80000000 ); + const v128_t four = v128_32( 4 ); for ( int i = 0; i < 19; i++ ) vdata[i] = v128_32( pdata[i] ); - *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + *noncev = v128_set32( n+ 3, n+ 2, n+1, n ); vdata[16+4] = last_byte; - memset_zero_128( vdata+16 + 5, 10 ); + v128_memset_zero( vdata+16 + 5, 10 ); vdata[16+15] = v128_32( 0x480 ); block[ 8] = last_byte; - memset_zero_128( block + 9, 6 ); + v128_memset_zero( block + 9, 6 ); block[15] = v128_32( 0x300 ); // initialize state @@ -341,7 +423,7 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce, sha256_4way_transform_le( block, vdata+16, midstate ); sha256_4way_transform_le( hash32, block, initstate ); - mm128_block_bswap_32( hash32, hash32 ); + v128_block_bswap32( hash32, hash32 ); for ( int lane = 0; lane < 4; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) @@ -353,7 +435,7 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce, submit_solution( work, lane_hash, mythr ); } } - *noncev = _mm_add_epi32( *noncev, four ); + *noncev = v128_add32( *noncev, four ); n += 4; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; @@ -371,11 +453,16 @@ bool register_sha256dt_algo( algo_gate_t* gate ) #elif defined(SHA256DT_SHA) gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256dt_sha; +#elif defined(SHA256DT_NEON_SHA2) + gate->optimizations = SHA_OPT; + gate->scanhash = (void*)&scanhash_sha256dt_neon_sha2; #elif defined(SHA256DT_8WAY) gate->scanhash = (void*)&scanhash_sha256dt_8way; -#else +#elif defined(SHA256DT_4WAY) gate->scanhash = (void*)&scanhash_sha256dt_4way; #endif + + return true; } diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c index 0d07a39..abfe5a1 100644 --- a/algo/sha/sha256q-4way.c +++ b/algo/sha/sha256q-4way.c @@ -188,7 +188,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned + v128_t *noncev = (v128_t*)vdata + 19; // aligned int thr_id = mythr->id; // thr_id arg is deprecated const uint64_t htmax[] = { 0, @@ -204,7 +204,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, 0xFFFF0000, 0 }; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); sha256_4way_init( &sha256_ctx4 ); sha256_4way_update( &sha256_ctx4, vdata, 64 ); @@ -212,7 +212,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, { uint32_t mask = masks[m]; do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); + *noncev = v128_bswap32( v128_set32( n+3,n+2,n+1,n ) ); pdata[19] = n; sha256q_4way_hash( hash, vdata ); diff --git a/algo/sha/sha256q.c b/algo/sha/sha256q.c index 90a2b7b..4eb428b 100644 --- a/algo/sha/sha256q.c +++ b/algo/sha/sha256q.c @@ -45,7 +45,7 @@ int scanhash_sha256q( struct work *work, uint32_t max_nonce, const int thr_id = mythr->id; const bool bench = opt_benchmark; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sha256q_midstate( edata ); do diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index c68abca..395dde3 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -131,11 +131,11 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id 
= mythr->id; const bool bench = opt_benchmark; - const __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); +// const v128_t shuf_bswap32 = +// v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); // hash first 64 byte block of data - sha256_opt_transform_le( mstatea, pdata, sha256_iv ); + sha256_transform_le( mstatea, pdata, sha256_iv ); // fill & pad second bock without nonce memcpy( block1a, pdata + 16, 12 ); @@ -147,7 +147,7 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, memset( block1b + 5, 0, 40 ); block1a[15] = block1b[15] = 0x480; // funky bit count - sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea); + sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea); // Pad third block block2a[ 8] = block2b[ 8] = 0x80000000; @@ -160,19 +160,17 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, // Insert nonce for second block block1a[3] = n; block1b[3] = n+1; - sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + sha256_2x_final_rounds( block2a, block2b, block1a, block1b, mstateb, mstateb, sstate, sstate ); - sha256_ni2way_transform_le( block2a, block2b, block2a, block2b, + sha256_2x_transform_le( block2a, block2b, block2a, block2b, sha256_iv, sha256_iv ); - sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, + sha256_2x_transform_le( hasha, hashb, block2a, block2b, sha256_iv, sha256_iv ); if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) { - casti_m128i( hasha, 0 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); - casti_m128i( hasha, 1 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) { pdata[19] = n; @@ -181,10 +179,90 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, } if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) { - casti_m128i( hashb, 0 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); - casti_m128i( hashb, 1 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hashb, mythr ); + } + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +#if defined(SHA256T_NEON_SHA2) + +int scanhash_sha256t_neon_sha2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + // hash first 64 byte block of data + sha256_transform_le( mstatea, pdata, sha256_iv ); + + // 
fill & pad second bock without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = 0; + block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 0x480; // funky bit count + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 80*8; // bit count + + do + { + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_neon2x_transform_le( block2a, block2b, block1a, block1b, + mstatea, mstatea ); + sha256_neon2x_transform_le( block2a, block2b, block2a, block2b, + sha256_iv, sha256_iv ); + sha256_neon2x_transform_le( hasha, hashb, block2a, block2b, + sha256_iv, sha256_iv ); + + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) + { + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hasha, mythr ); + } + } + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) + { + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) { pdata[19] = n+1; @@ -295,13 +373,13 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i vdata[32] __attribute__ ((aligned (64))); - __m128i block[16] __attribute__ ((aligned (32))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i istate[8] __attribute__ ((aligned (32))); - __m128i mstate[8] __attribute__ ((aligned (32))); -// __m128i mstate2[8] __attribute__ ((aligned (32))); -// __m128i mexp_pre[8] __attribute__ ((aligned (32))); + v128_t vdata[32] __attribute__ ((aligned (64))); + v128_t block[16] __attribute__ ((aligned (32))); + v128_t hash32[8] __attribute__ ((aligned (32))); + v128_t istate[8] __attribute__ ((aligned (32))); + v128_t mstate[8] __attribute__ ((aligned (32))); +// v128_t mstate2[8] __attribute__ ((aligned (32))); +// v128_t mexp_pre[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; @@ -310,23 +388,23 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m128i *noncev = vdata + 19; + v128_t *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; - const __m128i last_byte = v128_32( 0x80000000 ); - const __m128i four = v128_32( 4 ); + const v128_t last_byte = v128_32( 0x80000000 ); + const v128_t four = v128_32( 4 ); for ( int i = 0; i < 19; i++ ) vdata[i] = v128_32( pdata[i] ); - *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + *noncev = v128_set_32( n+ 3, n+ 2, n+1, n ); vdata[16+4] = last_byte; - memset_zero_128( vdata+16 + 5, 10 ); + v128_memset_zero( vdata+16 + 5, 10 ); vdata[16+15] = v128_32( 80*8 ); // bit count block[ 8] = last_byte; - memset_zero_128( block + 9, 6 ); + v128_memset_zero( block + 9, 6 ); block[15] = v128_32( 32*8 ); // bit count // initialize state @@ -353,10 +431,7 @@ int scanhash_sha256t_4way( struct work *work, 
const uint32_t max_nonce, sha256_4way_transform_le( block, block, istate ); sha256_4way_transform_le( hash32, block, istate ); -// if ( unlikely( sha256_4way_transform_le_short( -// hash32, block, initstate, ptarget ) )) -// { - mm128_block_bswap_32( hash32, hash32 ); + v128_block_bswap32( hash32, hash32 ); for ( int lane = 0; lane < 4; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) { @@ -367,8 +442,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, submit_solution( work, lane_hash, mythr ); } } -// } - *noncev = _mm_add_epi32( *noncev, four ); + *noncev = v128_add32( *noncev, four ); n += 4; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index e369f27..b29e2b2 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -10,8 +10,11 @@ bool register_sha256t_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_sha256t_sha; #elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256t_8way; -#else +#elif defined(SHA256T_4WAY) gate->scanhash = (void*)&scanhash_sha256t_4way; +#else + gate->scanhash = (void*)&scanhash_sha256t; + #endif return true; } @@ -22,16 +25,19 @@ bool register_sha256q_algo( algo_gate_t* gate ) #if defined(SHA256T_16WAY) gate->scanhash = (void*)&scanhash_sha256q_16way; gate->hash = (void*)&sha256q_16way_hash; -#elif defined(SHA256T_SHA) - gate->optimizations = SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256q; - gate->hash = (void*)&sha256q_hash; +//#elif defined(SHA256T_SHA) +// gate->optimizations = SHA_OPT; +// gate->scanhash = (void*)&scanhash_sha256q; +// gate->hash = (void*)&sha256q_hash; #elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256q_8way; gate->hash = (void*)&sha256q_8way_hash; -#else +#elif defined(SHA256T_4WAY) gate->scanhash = (void*)&scanhash_sha256q_4way; gate->hash = (void*)&sha256q_4way_hash; +//#else +// gate->scanhash = (void*)&scanhash_sha256q; +// gate->hash = (void*)&sha256q_4way; #endif return true; } diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index a20b3dd..db65ae4 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -8,6 +8,8 @@ #define SHA256T_16WAY 1 #elif defined(__SHA__) #define SHA256T_SHA 1 +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) + #define SHA256T_NEON_SHA2 1 #elif defined(__AVX2__) #define SHA256T_8WAY 1 #else @@ -51,6 +53,17 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, #endif +#if defined(SHA256T_NEON_SHA2) + +int scanhash_sha256t_neon_sha2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + +int sha256t_hash( void *output, const void *input ); +int scanhash_sha256t( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + int sha256q_hash( void *output, const void *input ); int scanhash_sha256q( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index 758d6b0..d77c335 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -33,7 +33,7 @@ #include #include -#ifdef __SSE4_1__ +#if defined(__SSE4_1__) || defined(__ARM_NEON) #include "shabal-hash-4way.h" #ifdef __cplusplus @@ -1245,16 +1245,16 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #endif // AVX2 #define DECL_STATE \ - __m128i A0, A1, A2, A3, A4, A5, A6, A7, \ + v128_t
A0, A1, A2, A3, A4, A5, A6, A7, \ A8, A9, AA, AB; \ - __m128i B0, B1, B2, B3, B4, B5, B6, B7, \ + v128_t B0, B1, B2, B3, B4, B5, B6, B7, \ B8, B9, BA, BB, BC, BD, BE, BF; \ - __m128i C0, C1, C2, C3, C4, C5, C6, C7, \ + v128_t C0, C1, C2, C3, C4, C5, C6, C7, \ C8, C9, CA, CB, CC, CD, CE, CF; \ - __m128i M0, M1, M2, M3, M4, M5, M6, M7, \ + v128_t M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ - const __m128i FIVE = v128_32( 5 ); \ - const __m128i THREE = v128_32( 3 ); \ + const v128_t FIVE = v128_32( 5 ); \ + const v128_t THREE = v128_32( 3 ); \ uint32_t Wlow, Whigh; #define READ_STATE(state) do \ @@ -1429,96 +1429,84 @@ do { \ #define INPUT_BLOCK_ADD \ do { \ - B0 = _mm_add_epi32( B0, M0 );\ - B1 = _mm_add_epi32( B1, M1 );\ - B2 = _mm_add_epi32( B2, M2 );\ - B3 = _mm_add_epi32( B3, M3 );\ - B4 = _mm_add_epi32( B4, M4 );\ - B5 = _mm_add_epi32( B5, M5 );\ - B6 = _mm_add_epi32( B6, M6 );\ - B7 = _mm_add_epi32( B7, M7 );\ - B8 = _mm_add_epi32( B8, M8 );\ - B9 = _mm_add_epi32( B9, M9 );\ - BA = _mm_add_epi32( BA, MA );\ - BB = _mm_add_epi32( BB, MB );\ - BC = _mm_add_epi32( BC, MC );\ - BD = _mm_add_epi32( BD, MD );\ - BE = _mm_add_epi32( BE, ME );\ - BF = _mm_add_epi32( BF, MF );\ + B0 = v128_add32( B0, M0 );\ + B1 = v128_add32( B1, M1 );\ + B2 = v128_add32( B2, M2 );\ + B3 = v128_add32( B3, M3 );\ + B4 = v128_add32( B4, M4 );\ + B5 = v128_add32( B5, M5 );\ + B6 = v128_add32( B6, M6 );\ + B7 = v128_add32( B7, M7 );\ + B8 = v128_add32( B8, M8 );\ + B9 = v128_add32( B9, M9 );\ + BA = v128_add32( BA, MA );\ + BB = v128_add32( BB, MB );\ + BC = v128_add32( BC, MC );\ + BD = v128_add32( BD, MD );\ + BE = v128_add32( BE, ME );\ + BF = v128_add32( BF, MF );\ } while (0) #define INPUT_BLOCK_SUB \ do { \ - C0 = _mm_sub_epi32( C0, M0 ); \ - C1 = _mm_sub_epi32( C1, M1 ); \ - C2 = _mm_sub_epi32( C2, M2 ); \ - C3 = _mm_sub_epi32( C3, M3 ); \ - C4 = _mm_sub_epi32( C4, M4 ); \ - C5 = _mm_sub_epi32( C5, M5 ); \ - C6 = _mm_sub_epi32( C6, M6 ); \ - C7 = _mm_sub_epi32( C7, M7 ); \ - C8 = _mm_sub_epi32( C8, M8 ); \ - C9 = _mm_sub_epi32( C9, M9 ); \ - CA = _mm_sub_epi32( CA, MA ); \ - CB = _mm_sub_epi32( CB, MB ); \ - CC = _mm_sub_epi32( CC, MC ); \ - CD = _mm_sub_epi32( CD, MD ); \ - CE = _mm_sub_epi32( CE, ME ); \ - CF = _mm_sub_epi32( CF, MF ); \ + C0 = v128_sub32( C0, M0 ); \ + C1 = v128_sub32( C1, M1 ); \ + C2 = v128_sub32( C2, M2 ); \ + C3 = v128_sub32( C3, M3 ); \ + C4 = v128_sub32( C4, M4 ); \ + C5 = v128_sub32( C5, M5 ); \ + C6 = v128_sub32( C6, M6 ); \ + C7 = v128_sub32( C7, M7 ); \ + C8 = v128_sub32( C8, M8 ); \ + C9 = v128_sub32( C9, M9 ); \ + CA = v128_sub32( CA, MA ); \ + CB = v128_sub32( CB, MB ); \ + CC = v128_sub32( CC, MC ); \ + CD = v128_sub32( CD, MD ); \ + CE = v128_sub32( CE, ME ); \ + CF = v128_sub32( CF, MF ); \ } while (0) #define XOR_W \ do { \ - A0 = _mm_xor_si128( A0, v128_32( Wlow ) ); \ - A1 = _mm_xor_si128( A1, v128_32( Whigh ) ); \ + A0 = v128_xor( A0, v128_32( Wlow ) ); \ + A1 = v128_xor( A1, v128_32( Whigh ) ); \ } while (0) -#define mm128_swap256_128( v1, v2 ) \ - v1 = _mm_xor_si128( v1, v2 ); \ - v2 = _mm_xor_si128( v1, v2 ); \ - v1 = _mm_xor_si128( v1, v2 ); +#define v128_swap256_128( v1, v2 ) \ + v1 = v128_xor( v1, v2 ); \ + v2 = v128_xor( v1, v2 ); \ + v1 = v128_xor( v1, v2 ); #define SWAP_BC \ do { \ - mm128_swap256_128( B0, C0 ); \ - mm128_swap256_128( B1, C1 ); \ - mm128_swap256_128( B2, C2 ); \ - mm128_swap256_128( B3, C3 ); \ - mm128_swap256_128( B4, C4 ); \ - mm128_swap256_128( B5, C5 ); \ - mm128_swap256_128( B6, C6 ); \ - mm128_swap256_128( B7, C7 
); \ - mm128_swap256_128( B8, C8 ); \ - mm128_swap256_128( B9, C9 ); \ - mm128_swap256_128( BA, CA ); \ - mm128_swap256_128( BB, CB ); \ - mm128_swap256_128( BC, CC ); \ - mm128_swap256_128( BD, CD ); \ - mm128_swap256_128( BE, CE ); \ - mm128_swap256_128( BF, CF ); \ + v128_swap256_128( B0, C0 ); \ + v128_swap256_128( B1, C1 ); \ + v128_swap256_128( B2, C2 ); \ + v128_swap256_128( B3, C3 ); \ + v128_swap256_128( B4, C4 ); \ + v128_swap256_128( B5, C5 ); \ + v128_swap256_128( B6, C6 ); \ + v128_swap256_128( B7, C7 ); \ + v128_swap256_128( B8, C8 ); \ + v128_swap256_128( B9, C9 ); \ + v128_swap256_128( BA, CA ); \ + v128_swap256_128( BB, CB ); \ + v128_swap256_128( BC, CC ); \ + v128_swap256_128( BD, CD ); \ + v128_swap256_128( BE, CE ); \ + v128_swap256_128( BF, CF ); \ } while (0) #define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \ do { \ - xa0 = mm128_xor3( xm, xb1, mm128_xorandnot( \ - _mm_mullo_epi32( mm128_xor3( xa0, xc, \ - _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) ), THREE ), \ + xa0 = v128_xor3( xm, xb1, v128_xorandnot( \ + v128_mullo32( v128_xor3( xa0, xc, \ + v128_mullo32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \ xb3, xb2 ) ); \ - xb0 = mm128_xnor( xa0, mm128_rol_32( xb0, 1 ) ); \ + xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \ } while (0) -/* -#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ -do { \ - xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \ - _mm_andnot_si128( xb3, xb2 ), \ - _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \ - _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \ - ) ), THREE ) ) ) ); \ - xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \ -} while (0) -*/ - #define PERM_STEP_0 do { \ PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \ @@ -1578,61 +1566,61 @@ do { \ #define APPLY_P \ do { \ - B0 = mm128_ror_32( B0, 15 ); \ - B1 = mm128_ror_32( B1, 15 ); \ - B2 = mm128_ror_32( B2, 15 ); \ - B3 = mm128_ror_32( B3, 15 ); \ - B4 = mm128_ror_32( B4, 15 ); \ - B5 = mm128_ror_32( B5, 15 ); \ - B6 = mm128_ror_32( B6, 15 ); \ - B7 = mm128_ror_32( B7, 15 ); \ - B8 = mm128_ror_32( B8, 15 ); \ - B9 = mm128_ror_32( B9, 15 ); \ - BA = mm128_ror_32( BA, 15 ); \ - BB = mm128_ror_32( BB, 15 ); \ - BC = mm128_ror_32( BC, 15 ); \ - BD = mm128_ror_32( BD, 15 ); \ - BE = mm128_ror_32( BE, 15 ); \ - BF = mm128_ror_32( BF, 15 ); \ + B0 = v128_ror32( B0, 15 ); \ + B1 = v128_ror32( B1, 15 ); \ + B2 = v128_ror32( B2, 15 ); \ + B3 = v128_ror32( B3, 15 ); \ + B4 = v128_ror32( B4, 15 ); \ + B5 = v128_ror32( B5, 15 ); \ + B6 = v128_ror32( B6, 15 ); \ + B7 = v128_ror32( B7, 15 ); \ + B8 = v128_ror32( B8, 15 ); \ + B9 = v128_ror32( B9, 15 ); \ + BA = v128_ror32( BA, 15 ); \ + BB = v128_ror32( BB, 15 ); \ + BC = v128_ror32( BC, 15 ); \ + BD = v128_ror32( BD, 15 ); \ + BE = v128_ror32( BE, 15 ); \ + BF = v128_ror32( BF, 15 ); \ PERM_STEP_0; \ PERM_STEP_1; \ PERM_STEP_2; \ - AB = _mm_add_epi32( AB, C6 ); \ - AA = _mm_add_epi32( AA, C5 ); \ - A9 = _mm_add_epi32( A9, C4 ); \ - A8 = _mm_add_epi32( A8, C3 ); \ - A7 = _mm_add_epi32( A7, C2 ); \ - A6 = _mm_add_epi32( A6, C1 ); \ - A5 = _mm_add_epi32( A5, C0 ); \ - A4 = _mm_add_epi32( A4, CF ); \ - A3 = _mm_add_epi32( A3, CE ); \ - A2 = _mm_add_epi32( A2, CD ); \ - A1 = _mm_add_epi32( A1, CC ); \ - A0 = _mm_add_epi32( A0, CB ); \ - AB = _mm_add_epi32( AB, CA ); \ - AA = _mm_add_epi32( AA, C9 ); \ - A9 = _mm_add_epi32( A9, C8 ); \ - A8 = _mm_add_epi32( A8, C7 ); \ - A7 = _mm_add_epi32( A7, C6 ); \ - A6 = _mm_add_epi32( A6, C5 ); \ - A5 = 
_mm_add_epi32( A5, C4 ); \ - A4 = _mm_add_epi32( A4, C3 ); \ - A3 = _mm_add_epi32( A3, C2 ); \ - A2 = _mm_add_epi32( A2, C1 ); \ - A1 = _mm_add_epi32( A1, C0 ); \ - A0 = _mm_add_epi32( A0, CF ); \ - AB = _mm_add_epi32( AB, CE ); \ - AA = _mm_add_epi32( AA, CD ); \ - A9 = _mm_add_epi32( A9, CC ); \ - A8 = _mm_add_epi32( A8, CB ); \ - A7 = _mm_add_epi32( A7, CA ); \ - A6 = _mm_add_epi32( A6, C9 ); \ - A5 = _mm_add_epi32( A5, C8 ); \ - A4 = _mm_add_epi32( A4, C7 ); \ - A3 = _mm_add_epi32( A3, C6 ); \ - A2 = _mm_add_epi32( A2, C5 ); \ - A1 = _mm_add_epi32( A1, C4 ); \ - A0 = _mm_add_epi32( A0, C3 ); \ + AB = v128_add32( AB, C6 ); \ + AA = v128_add32( AA, C5 ); \ + A9 = v128_add32( A9, C4 ); \ + A8 = v128_add32( A8, C3 ); \ + A7 = v128_add32( A7, C2 ); \ + A6 = v128_add32( A6, C1 ); \ + A5 = v128_add32( A5, C0 ); \ + A4 = v128_add32( A4, CF ); \ + A3 = v128_add32( A3, CE ); \ + A2 = v128_add32( A2, CD ); \ + A1 = v128_add32( A1, CC ); \ + A0 = v128_add32( A0, CB ); \ + AB = v128_add32( AB, CA ); \ + AA = v128_add32( AA, C9 ); \ + A9 = v128_add32( A9, C8 ); \ + A8 = v128_add32( A8, C7 ); \ + A7 = v128_add32( A7, C6 ); \ + A6 = v128_add32( A6, C5 ); \ + A5 = v128_add32( A5, C4 ); \ + A4 = v128_add32( A4, C3 ); \ + A3 = v128_add32( A3, C2 ); \ + A2 = v128_add32( A2, C1 ); \ + A1 = v128_add32( A1, C0 ); \ + A0 = v128_add32( A0, CF ); \ + AB = v128_add32( AB, CE ); \ + AA = v128_add32( AA, CD ); \ + A9 = v128_add32( A9, CC ); \ + A8 = v128_add32( A8, CB ); \ + A7 = v128_add32( A7, CA ); \ + A6 = v128_add32( A6, C9 ); \ + A5 = v128_add32( A5, C8 ); \ + A4 = v128_add32( A4, C7 ); \ + A3 = v128_add32( A3, C6 ); \ + A2 = v128_add32( A2, C5 ); \ + A1 = v128_add32( A1, C4 ); \ + A0 = v128_add32( A0, C3 ); \ } while (0) #define INCR_W do { \ @@ -1798,8 +1786,8 @@ static void shabal_4way_core( void *cc, const unsigned char *data, size_t len ) { shabal_4way_context *sc = (shabal_4way_context*)cc; - __m128i *buf; - __m128i *vdata = (__m128i*)data; + v128_t *buf; + v128_t *vdata = (v128_t*)data; const int buf_size = 64; size_t ptr; DECL_STATE @@ -1809,7 +1797,7 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len ) if ( len < (buf_size - ptr ) ) { - memcpy_128( buf + (ptr>>2), vdata, len>>2 ); + v128_memcpy( buf + (ptr>>2), vdata, len>>2 ); ptr += len; sc->ptr = ptr; return; @@ -1824,7 +1812,7 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len ) clen = buf_size - ptr; if ( clen > len ) clen = len; - memcpy_128( buf + (ptr>>2), vdata, clen>>2 ); + v128_memcpy( buf + (ptr>>2), vdata, clen>>2 ); ptr += clen; vdata += clen>>2; @@ -1850,7 +1838,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst, unsigned size_words ) { shabal_4way_context *sc = (shabal_4way_context*)cc; - __m128i *buf; + v128_t *buf; const int buf_size = 64; size_t ptr; int i; @@ -1862,7 +1850,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst, z = 0x80 >> n; zz = ((ub & -z) | z) & 0xFF; buf[ptr>>2] = v128_32( zz ); - memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 ); + v128_memset_zero( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 ); READ_STATE(sc); DECODE_BLOCK; INPUT_BLOCK_ADD; @@ -1876,7 +1864,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst, APPLY_P; } - __m128i *d = (__m128i*)dst; + v128_t *d = (v128_t*)dst; if ( size_words == 16 ) // 512 { d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3; diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index cd216f3..c431e53 100644 --- a/algo/shabal/shabal-hash-4way.h 
+++ b/algo/shabal/shabal-hash-4way.h @@ -1,7 +1,7 @@ #ifndef SHABAL_HASH_4WAY_H__ #define SHABAL_HASH_4WAY_H__ 1 -#ifdef __SSE4_1__ +#if defined(__SSE4_1__) || defined(__ARM_NEON) #include #include "simd-utils.h" @@ -65,8 +65,8 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, #endif typedef struct { - __m128i buf[16] __attribute__ ((aligned (64))); - __m128i A[12], B[16], C[16]; + v128_t buf[16] __attribute__ ((aligned (64))); + v128_t A[12], B[16], C[16]; uint32_t Whigh, Wlow; size_t ptr; bool state_loaded; diff --git a/algo/shavite/shavite-hash.h b/algo/shavite/shavite-hash.h new file mode 100644 index 0000000..b645588 --- /dev/null +++ b/algo/shavite/shavite-hash.h @@ -0,0 +1,315 @@ +/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */ +/** + * SHAvite-3 interface. This code implements SHAvite-3 with the + * recommended parameters for SHA-3, with outputs of 224, 256, 384 and + * 512 bits. In the following, we call the function "SHAvite" (without + * the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit + * output". + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_shavite.h + * @author Thomas Pornin + */ + +#ifndef SPH_SHAVITE_H__ +#define SPH_SHAVITE_H__ + +#include +#include "compat/sph_types.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +/** + * Output size (in bits) for SHAvite-224. + */ +#define SPH_SIZE_shavite224 224 + +/** + * Output size (in bits) for SHAvite-256. + */ +#define SPH_SIZE_shavite256 256 + +/** + * Output size (in bits) for SHAvite-384. + */ +#define SPH_SIZE_shavite384 384 + +/** + * Output size (in bits) for SHAvite-512. + */ +#define SPH_SIZE_shavite512 512 + +/** + * This structure is a context for SHAvite-224 and SHAvite-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). 
+ */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64] __attribute__ ((aligned (64))); + sph_u32 h[8] __attribute__ ((aligned (32))); + size_t ptr; + sph_u32 count0, count1; +#endif +} sph_shavite_small_context; + +/** + * This structure is a context for SHAvite-224 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite224_context; + +/** + * This structure is a context for SHAvite-256 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite256_context; + +/** + * This structure is a context for SHAvite-384 and SHAvite-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128] __attribute__ ((aligned (64))); + sph_u32 h[16] __attribute__ ((aligned (32)));; + size_t ptr; + sph_u32 count0, count1, count2, count3; +#endif +} sph_shavite_big_context; + +/** + * This structure is a context for SHAvite-384 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_big_context sph_shavite384_context; + +/** + * This structure is a context for SHAvite-512 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_big_context sph_shavite512_context; + +/** + * Initialize a SHAvite-224 context. This process performs no memory allocation. + * + * @param cc the SHAvite-224 context (pointer to a + * sph_shavite224_context) + */ +void sph_shavite224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite224(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-224 context + * @param dst the destination buffer + */ +void sph_shavite224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-256 context. This process performs no memory allocation. + * + * @param cc the SHAvite-256 context (pointer to a + * sph_shavite256_context) + */ +void sph_shavite256_init(void *cc); + +/** + * Process some data bytes. 
It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite256(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-256 context + * @param dst the destination buffer + */ +void sph_shavite256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-384 context. This process performs no memory allocation. + * + * @param cc the SHAvite-384 context (pointer to a + * sph_shavite384_context) + */ +void sph_shavite384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-384 context + * @param dst the destination buffer + */ +void sph_shavite384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +//Don't call these directly from application code, use the macros below. 
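/* Illustrative usage sketch, not part of the original header; variable names
   are hypothetical. The same call compiles against either the AES-accelerated
   or the software implementation selected by the #if below:

      shavite512_context ctx;
      unsigned char digest[64];                  // SHAvite-512 digest is 64 bytes
      shavite512_full( &ctx, digest, data, 80 ); // init + update + close in one shot
*/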
+#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) ) + +void sph_shavite512_aesni_init(void *cc); +void sph_shavite512_aesni(void *cc, const void *data, size_t len); +void sph_shavite512_aesni_close(void *cc, void *dst); +void sph_shavite512_aesni_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#define sph_shavite512_init sph_shavite512_aesni_init +#define sph_shavite512 sph_shavite512_aesni +#define sph_shavite512_close sph_shavite512_aesni_close +#define sph_shavite512_addbits_and_close \ + sph_shavite512_aesni_addbits_and_close + +#else + +void sph_shavite512_sw_init(void *cc); +void sph_shavite512_sw(void *cc, const void *data, size_t len); +void sph_shavite512_sw_close(void *cc, void *dst); +void sph_shavite512_sw_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + + +#define sph_shavite512_init sph_shavite512_sw_init +#define sph_shavite512 sph_shavite512_sw +#define sph_shavite512_close sph_shavite512_sw_close +#define sph_shavite512_addbits_and_close \ + sph_shavite512_sw_addbits_and_close + +#endif + +// Use these macros from application code. +#define shavite512_context sph_shavite512_context + +#define shavite512_init sph_shavite512_init +#define shavite512_update sph_shavite512 +#define shavite512_close sph_shavite512_close + +#define shavite512_full( cc, dst, data, len ) \ +do{ \ + shavite512_init( cc ); \ + shavite512_update( cc, data, len ); \ + shavite512_close( cc, dst ); \ +}while(0) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index eaa6306..787a5c0 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -33,7 +33,9 @@ #include #include -#if defined(__AES__) +#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) ) + +#pragma message "AES for shavite" #include "sph_shavite.h" #include "simd-utils.h" @@ -50,24 +52,21 @@ extern "C"{ #pragma warning (disable: 4146) #endif -#define C32 SPH_C32 - static const sph_u32 IV512[] = { - C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC), - C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC), - C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47), - C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) + 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, + 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, + 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, + 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A }; - static void c512( sph_shavite_big_context *sc, const void *msg ) { - const __m128i zero = _mm_setzero_si128(); - __m128i p0, p1, p2, p3, x; - __m128i k00, k01, k02, k03, k10, k11, k12, k13; - __m128i *m = (__m128i*)msg; - __m128i *h = (__m128i*)sc->h; + const v128_t zero = v128_zero; + v128_t p0, p1, p2, p3, x; + v128_t k00, k01, k02, k03, k10, k11, k12, k13; + v128_t *m = (v128_t*)msg; + v128_t *h = (v128_t*)sc->h; int r; p0 = h[0]; @@ -78,242 +77,242 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round k00 = m[0]; - x = _mm_xor_si128( p1, k00 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( p1, k00 ); + x = v128_aesenc( x, zero ); k01 = m[1]; - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); k02 = m[2]; - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); k03 = m[3]; - x = 
_mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p0 = _mm_xor_si128( p0, x ); + p0 = v128_xor( p0, x ); k10 = m[4]; - x = _mm_xor_si128( p3, k10 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( p3, k10 ); + x = v128_aesenc( x, zero ); k11 = m[5]; - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); k12 = m[6]; - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); k13 = m[7]; - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); - p2 = _mm_xor_si128( p2, x ); + p2 = v128_xor( p2, x ); for ( r = 0; r < 3; r ++ ) { // round 1, 5, 9 - k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); - k00 = _mm_xor_si128( k00, k13 ); + k00 = v128_shuflr32( v128_aesenc( k00, zero ) ); + k00 = v128_xor( k00, k13 ); if ( r == 0 ) - k00 = _mm_xor_si128( k00, _mm_set_epi32( + k00 = v128_xor( k00, v128_set32( ~sc->count3, sc->count2, sc->count1, sc->count0 ) ); - x = _mm_xor_si128( p0, k00 ); - x = _mm_aesenc_si128( x, zero ); - k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); - k01 = _mm_xor_si128( k01, k00 ); + x = v128_xor( p0, k00 ); + x = v128_aesenc( x, zero ); + k01 = v128_shuflr32( v128_aesenc( k01, zero ) ); + k01 = v128_xor( k01, k00 ); if ( r == 1 ) - k01 = _mm_xor_si128( k01, _mm_set_epi32( + k01 = v128_xor( k01, v128_set32( ~sc->count0, sc->count1, sc->count2, sc->count3 ) ); - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); - k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); - k02 = _mm_xor_si128( k02, k01 ); - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); - k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); - k03 = _mm_xor_si128( k03, k02 ); - x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); + k02 = v128_shuflr32( v128_aesenc( k02, zero ) ); + k02 = v128_xor( k02, k01 ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); + k03 = v128_shuflr32( v128_aesenc( k03, zero ) ); + k03 = v128_xor( k03, k02 ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p3 = _mm_xor_si128( p3, x ); + p3 = v128_xor( p3, x ); - k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); - k10 = _mm_xor_si128( k10, k03 ); + k10 = v128_shuflr32( v128_aesenc( k10, zero ) ); + k10 = v128_xor( k10, k03 ); - x = _mm_xor_si128( p2, k10 ); - x = _mm_aesenc_si128( x, zero ); - k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); - k11 = _mm_xor_si128( k11, k10 ); - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); - k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); - k12 = _mm_xor_si128( k12, k11 ); - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); - k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); - k13 = _mm_xor_si128( k13, k12 ); + x = v128_xor( p2, k10 ); + x = v128_aesenc( x, zero ); + k11 = v128_shuflr32( v128_aesenc( k11, zero ) ); + k11 = v128_xor( k11, k10 ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); + k12 = v128_shuflr32( v128_aesenc( k12, zero ) ); + k12 = v128_xor( k12, k11 ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); + k13 = v128_shuflr32( v128_aesenc( k13, zero ) ); + k13 = v128_xor( k13, k12 ); if ( r == 2 ) - k13 = _mm_xor_si128( k13, _mm_set_epi32( + k13 = v128_xor( k13, v128_set32( ~sc->count1, sc->count0, sc->count3, sc->count2 ) 
); - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); - p1 = _mm_xor_si128( p1, x ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); + p1 = v128_xor( p1, x ); // round 2, 6, 10 - k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) ); - x = _mm_xor_si128( p3, k00 ); - x = _mm_aesenc_si128( x, zero ); - k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) ); - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); - k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) ); - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); - k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) ); - x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) ); + x = v128_xor( p3, k00 ); + x = v128_aesenc( x, zero ); + k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); + k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); + k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p2 = _mm_xor_si128( p2, x ); + p2 = v128_xor( p2, x ); - k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) ); - x = _mm_xor_si128( p1, k10 ); - x = _mm_aesenc_si128( x, zero ); - k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) ); - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); - k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) ); - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); - k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) ); - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); + k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) ); + x = v128_xor( p1, k10 ); + x = v128_aesenc( x, zero ); + k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); + k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); + k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); - p0 = _mm_xor_si128( p0, x ); + p0 = v128_xor( p0, x ); // round 3, 7, 11 - k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); - k00 = _mm_xor_si128( k00, k13 ); - x = _mm_xor_si128( p2, k00 ); - x = _mm_aesenc_si128( x, zero ); - k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); - k01 = _mm_xor_si128( k01, k00 ); - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); - k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); - k02 = _mm_xor_si128( k02, k01 ); - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); - k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); - k03 = _mm_xor_si128( k03, k02 ); - x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + k00 = v128_shuflr32( v128_aesenc( k00, zero ) ); + k00 = v128_xor( k00, k13 ); + x = v128_xor( p2, k00 ); + x = v128_aesenc( x, zero ); + k01 = v128_shuflr32( v128_aesenc( k01, zero ) ); + k01 = v128_xor( k01, k00 ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); + k02 = v128_shuflr32( v128_aesenc( k02, zero ) ); + k02 = v128_xor( k02, k01 ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); + k03 = v128_shuflr32( v128_aesenc( k03, zero ) ); + k03 = v128_xor( k03, k02 ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p1 = _mm_xor_si128( p1, x ); + p1 = v128_xor( p1, x ); - k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); - k10 = 
_mm_xor_si128( k10, k03 ); - x = _mm_xor_si128( p0, k10 ); - x = _mm_aesenc_si128( x, zero ); - k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); - k11 = _mm_xor_si128( k11, k10 ); - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); - k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); - k12 = _mm_xor_si128( k12, k11 ); - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); - k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); - k13 = _mm_xor_si128( k13, k12 ); - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); + k10 = v128_shuflr32( v128_aesenc( k10, zero ) ); + k10 = v128_xor( k10, k03 ); + x = v128_xor( p0, k10 ); + x = v128_aesenc( x, zero ); + k11 = v128_shuflr32( v128_aesenc( k11, zero ) ); + k11 = v128_xor( k11, k10 ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); + k12 = v128_shuflr32( v128_aesenc( k12, zero ) ); + k12 = v128_xor( k12, k11 ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); + k13 = v128_shuflr32( v128_aesenc( k13, zero ) ); + k13 = v128_xor( k13, k12 ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); - p3 = _mm_xor_si128( p3, x ); + p3 = v128_xor( p3, x ); // round 4, 8, 12 - k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) ); - x = _mm_xor_si128( p1, k00 ); - x = _mm_aesenc_si128( x, zero ); - k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) ); - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); - k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) ); - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); - k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) ); - x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) ); + x = v128_xor( p1, k00 ); + x = v128_aesenc( x, zero ); + k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); + k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); + k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p0 = _mm_xor_si128( p0, x ); + p0 = v128_xor( p0, x ); - k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) ); - x = _mm_xor_si128( p3, k10 ); - x = _mm_aesenc_si128( x, zero ); - k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) ); - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); - k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) ); - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); - k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) ); - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); + k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) ); + x = v128_xor( p3, k10 ); + x = v128_aesenc( x, zero ); + k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); + k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); + k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); - p2 = _mm_xor_si128( p2, x ); + p2 = v128_xor( p2, x ); } // round 13 - k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); - k00 = _mm_xor_si128( k00, k13 ); - x = _mm_xor_si128( p0, k00 ); - x = _mm_aesenc_si128( x, zero ); - k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); - k01 = _mm_xor_si128( k01, k00 ); - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); - 
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); - k02 = _mm_xor_si128( k02, k01 ); - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); - k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); - k03 = _mm_xor_si128( k03, k02 ); - x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + k00 = v128_shuflr32( v128_aesenc( k00, zero ) ); + k00 = v128_xor( k00, k13 ); + x = v128_xor( p0, k00 ); + x = v128_aesenc( x, zero ); + k01 = v128_shuflr32( v128_aesenc( k01, zero ) ); + k01 = v128_xor( k01, k00 ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); + k02 = v128_shuflr32( v128_aesenc( k02, zero ) ); + k02 = v128_xor( k02, k01 ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); + k03 = v128_shuflr32( v128_aesenc( k03, zero ) ); + k03 = v128_xor( k03, k02 ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p3 = _mm_xor_si128( p3, x ); + p3 = v128_xor( p3, x ); - k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); - k10 = _mm_xor_si128( k10, k03 ); - x = _mm_xor_si128( p2, k10 ); - x = _mm_aesenc_si128( x, zero ); - k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); - k11 = _mm_xor_si128( k11, k10 ); - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); - k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); - k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( + k10 = v128_shuflr32( v128_aesenc( k10, zero ) ); + k10 = v128_xor( k10, k03 ); + x = v128_xor( p2, k10 ); + x = v128_aesenc( x, zero ); + k11 = v128_shuflr32( v128_aesenc( k11, zero ) ); + k11 = v128_xor( k11, k10 ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); + k12 = v128_shuflr32( v128_aesenc( k12, zero ) ); + k12 = v128_xor( k12, v128_xor( k11, v128_set32( ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); - k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); - k13 = _mm_xor_si128( k13, k12 ); - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); + k13 = v128_shuflr32( v128_aesenc( k13, zero ) ); + k13 = v128_xor( k13, k12 ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); - p1 = _mm_xor_si128( p1, x ); + p1 = v128_xor( p1, x ); - h[0] = _mm_xor_si128( h[0], p2 ); - h[1] = _mm_xor_si128( h[1], p3 ); - h[2] = _mm_xor_si128( h[2], p0 ); - h[3] = _mm_xor_si128( h[3], p1 ); + h[0] = v128_xor( h[0], p2 ); + h[1] = v128_xor( h[1], p3 ); + h[2] = v128_xor( h[2], p0 ); + h[3] = v128_xor( h[3], p1 ); } diff --git a/algo/shavite/sph_shavite.h b/algo/shavite/sph_shavite.h index c470e6d..b645588 100644 --- a/algo/shavite/sph_shavite.h +++ b/algo/shavite/sph_shavite.h @@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); //Don't call these directly from application code, use the macros below. 
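/*
 * The c512() conversion above swaps raw _mm_* intrinsics for v128_* wrappers
 * so the same compression-function source builds for SSE/AES-NI and for NEON.
 * A hedged sketch of how such a wrapper layer can be defined; the names below
 * are illustrative, not the actual definitions in simd-utils.h.  On AArch64,
 * one x86 AESENC round is AESE with a zero round key (AddRoundKey + SubBytes
 * + ShiftRows), then AESMC (MixColumns), then an XOR with the round key.
 */
#if defined(__SSE2__) && defined(__AES__)
  #include <immintrin.h>
  typedef __m128i v128x_t;
  #define v128x_xor( a, b )     _mm_xor_si128( a, b )
  #define v128x_aesenc( x, k )  _mm_aesenc_si128( x, k )
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_AES)
  #include <arm_neon.h>
  typedef uint8x16_t v128x_t;
  #define v128x_xor( a, b )     veorq_u8( a, b )
  #define v128x_aesenc( x, k ) \
      veorq_u8( vaesmcq_u8( vaeseq_u8( x, vdupq_n_u8( 0 ) ) ), k )
#endif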
-#if defined(__AES__) && defined(__SSSE3__) +#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) ) void sph_shavite512_aesni_init(void *cc); void sph_shavite512_aesni(void *cc, const void *data, size_t len); diff --git a/algo/simd/vector.c b/algo/simd/vector.c index 60f0cc7..dc13d76 100644 --- a/algo/simd/vector.c +++ b/algo/simd/vector.c @@ -4,6 +4,9 @@ #include "nist.h" #include "vector.h" + +#if defined(__SSE2__) + #define PRINT_SOME 0 int SupportedLength(int hashbitlen) { @@ -938,3 +941,5 @@ void fft128_natural(fft_t *x, unsigned char *a) { x[2*i+1] = y[i+64]; } } + +#endif // SSE2 diff --git a/algo/simd/vector.h b/algo/simd/vector.h index 6d686a0..756c7f5 100644 --- a/algo/simd/vector.h +++ b/algo/simd/vector.h @@ -3,14 +3,10 @@ #include "compat.h" -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) - /******************************* * Using GCC vector extensions * *******************************/ -#if defined(__SSE2__) - //typedef unsigned char v16qi __attribute__ ((vector_size (16))); typedef char v16qi __attribute__ ((vector_size (16))); typedef short v8hi __attribute__ ((vector_size (16))); @@ -65,6 +61,10 @@ union u32 { #define v32_andn(x,y) ((v32) vec_andn((x), (y))) #endif +//TODO aarch support for widening multiply + +#if defined(__SSE2__) + #define vec_and(x,y) ((x)&(y)) #define vec_or(x,y) ((x)|(y)) #define vec_xor(x,y) ((x)^(y)) @@ -127,72 +127,11 @@ union u32 { #define CV(x) {{x, x, x, x, x, x, x, x}} -#elif defined(__ALTIVEC__) - -#include - -typedef vector unsigned char v8; -typedef vector signed short v16; -typedef vector unsigned int v32; - -#define V3216(x) ((v16) (x)) -#define V1632(x) ((v32) (x)) -#define V168(x) ( (v8) (x)) -#define V816(x) ((v16) (x)) - -#define V16_SIZE 8 -#define print_vec print_sse - -#define MAKE_VECT(x, ...) 
{{x, __VA_ARGS__}} - -#define CV(x) MAKE_VECT(x, x, x, x, x, x, x, x) -#define CV16(x) ((vector signed short) {x,x,x,x,x,x,x,x}) -#define CVU16(x) ((vector unsigned short) {x,x,x,x,x,x,x,x}) -#define CV32(x) ((vector unsigned int ) {x,x,x,x}) - -union cv { - unsigned short u16[8]; - v16 v16; -}; - -union cv8 { - unsigned char u8[16]; - v8 v8; -}; - -union ucv { - unsigned short u16[8]; - vector unsigned char v16; -}; - -// Nasty hack to avoid macro expansion madness - - -/* altivec.h is broken with Gcc 3.3 is C99 mode */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#define typeof __typeof -#endif - -MAYBE_INLINE v16 vec_and_fun (v16 x, v16 y) { - return vec_and (x, y); -} - -MAYBE_INLINE v16 vec_or_fun (v16 x, v16 y) { - return vec_or (x, y); -} - -MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) { - return vec_xor (x, y); -} - -#undef vec_and -#undef vec_or -#undef vec_xor - -#define vec_and(x,y) ((__typeof(x)) vec_and_fun((v16) x, (v16) y)) -#define vec_or(x,y) ((__typeof(x)) vec_or_fun((v16) x, (v16) y)) -#define vec_xor(x,y) ((__typeof(x)) vec_xor_fun((v16) x, (v16) y)) +#elif defined(__aarch64__) && defined(__ARM_NEON) +#define vec_and( x, y ) v128_and( x, y ) +#define vec_or(x,y) v128_or( x, y ) +#define vec_xor(x,y) v128_xor( x, y ) #define v16_and vec_and #define v16_or vec_or @@ -202,128 +141,36 @@ MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) { #define v32_or vec_or #define v32_xor vec_xor +#define vec_andn( x,y ) v128_andnot( x, y ) +#define v16_andn vec_andn +#define v32_andn vec_andn -#define v32_add vec_add +#define v32_add( x, y ) v128_add32( x, y ) -#define v16_add vec_add -#define v16_sub vec_sub -#define v16_mul(a,b) vec_mladd(a,b,CV16(0)) +#define v16_add( x, y ) v128_add16( x, y ) +#define v16_sub( x, y ) v128_sub16( x, y ) +#define v16_mul( x, y ) v128_mul16( x, y ) +#define v16_neg(x) v128_negate16( x ) +#define v16_shift_l( x, c ) v128_sl16 +#define v16_shift_r v128_sr16 +#define v16_cmp v128_cmpgt16 -vector unsigned short ZZ = {0,0,0,0,0,0,0,0}; +#define v16_interleavel v128_unpacklo16 +#define v16_interleaveh v128_unpackhi16 -v16 v16_shift_l(v16 x,int s) { - vector unsigned short shift = {s,s,s,s,s,s,s,s}; - v16 y = vec_sl (x, shift); - return y; -} -#define v16_shift_l(x,s) vec_sl (x,CVU16(s)) -#define v16_shift_r(x,s) vec_sra(x,CVU16(s)) -#define v16_cmp vec_cmpgt +// the builtins compile for arm, so ??? 
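On the "???" above: if the x86 __builtin_ia32_punpck* builtins turn out not to
be usable on AArch64, the same interleaves exist as NEON zip intrinsics. A
hedged sketch with illustrative helper names (reinterpret casts to and from
the GCC vector types may be needed at the call sites):

#include <arm_neon.h>

static inline int16x8_t v16_mergel_neon( int16x8_t a, int16x8_t b )
{   return vzip1q_s16( a, b );   }   // a0,b0,a1,b1,...  == punpcklwd(a,b)

static inline int16x8_t v16_mergeh_neon( int16x8_t a, int16x8_t b )
{   return vzip2q_s16( a, b );   }   // a4,b4,a5,b5,...  == punpckhwd(a,b)

static inline int8x16_t v8_mergel_neon( int8x16_t a, int8x16_t b )
{   return vzip1q_s8( a, b );   }    // byte-wise low interleave

static inline int8x16_t v8_mergeh_neon( int8x16_t a, int8x16_t b )
{   return vzip2q_s8( a, b );   }    // byte-wise high interleave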
+#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b)) +#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b)) -#define v16_mergel(a,b) V1632(vec_mergeh(b,a)) -#define v16_mergeh(a,b) V1632(vec_mergel(b,a)) +#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b)) +#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b)) -#define v16_interleavel(a,b) vec_mergeh(a,b) -#define v16_interleaveh(a,b) vec_mergel(a,b) +#define v32_shift_l v128_sl32 +#define v32_shift_r v128_sr32 -#define v8_mergel(a,b) V816(vec_mergeh(b,a)) -#define v8_mergeh(a,b) V816(vec_mergel(b,a)) +#define v32_rotate(x,n) v128_rol32 -#define v32_rotate(x,s) vec_rl(x,CV32(s)) - -// #define v32_unpckl vec_mergel -// #define v32_unpckh vec_mergeh - -#define vector_shuffle(x,s) vec_perm(x,x,s) - -static const v8 SHUFXOR_1 = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; -static const v8 SHUFXOR_2 = {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7}; -static const v8 SHUFXOR_3 = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3}; - -#define v32_shufxor(x,s) vector_shuffle(x,SHUFXOR_##s) - -//static const v8 SHUFSWAP = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}; -static const v8 SHUFSWAP = {3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12}; - -#define v32_bswap(x) vector_shuffle(x,SHUFSWAP) - -#else - -#error "I don't know how to vectorize on this architecture." - -#endif - -#else - -/******************************** - * Using MSVC/ICC vector instrinsics * - ********************************/ - -#include - -typedef __m128i v8; -typedef __m128i v16; -typedef __m128i v32; - -#define V3216(x) (x) -#define V1632(x) (x) -#define V168(x) (x) -#define V816(x) (x) - -#define V16_SIZE 8 - -union cv { - unsigned short u16[8]; - v16 v16; -}; - -union cv8 { - unsigned char u8[16]; - v8 v8; -}; - -#define CV(x) {{x, x, x, x, x, x, x, x}} - -#define vec_and _mm_and_si128 -#define vec_or _mm_or_si128 -#define vec_xor _mm_xor_si128 - -#define v16_and vec_and -#define v16_or vec_or -#define v16_xor vec_xor - -#define v32_and vec_and -#define v32_or vec_or -#define v32_xor vec_xor - -#define vector_shuffle(x,s) _mm_shuffle_epi8(x, s) - -#define v32_add _mm_add_epi32 - -#define v16_add _mm_add_epi16 -#define v16_sub _mm_sub_epi16 -#define v16_mul _mm_mullo_epi16 -#define v16_neg(x) (-(x)) -#define v16_shift_l _mm_slli_epi16 -#define v16_shift_r _mm_srai_epi16 -#define v16_cmp _mm_cmpgt_epi16 - -#define v16_interleavel _mm_unpacklo_epi16 -#define v16_interleaveh _mm_unpackhi_epi16 - -#define v16_mergel _mm_unpacklo_epi16 -#define v16_mergeh _mm_unpackhi_epi16 - -#define v8_mergel _mm_unpacklo_epi8 -#define v8_mergeh _mm_unpackhi_epi8 - -#define v32_shift_l _mm_slli_epi32 -#define v32_shift_r _mm_srli_epi32 - -#define v32_rotate(x,n) \ - vec_or(v32_shift_l(x,n), v32_shift_r(x,32-(n))) - -#define v32_shuf _mm_shuffle_epi32 +#define v32_shuf __builtin_ia32_pshufd #define SHUFXOR_1 0xb1 /* 0b10110001 */ #define SHUFXOR_2 0x4e /* 0b01001110 */ @@ -332,13 +179,25 @@ union cv8 { #define CAT(x, y) x##y #define XCAT(x,y) CAT(x,y) -//#define v32_shufxor(x,s) v32_shuf(x,SHUFXOR_##s) #define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s)) #define v32_bswap(x) (x) +#define v16_broadcast(x) ({ \ + union u32 u; \ + u32 xx = x; \ + u.u[0] = xx | (xx << 16); \ + V3216(v32_shuf(u.v,0)); }) + +#define CV(x) {{x, x, x, x, x, x, x, x}} + +#else + +#error "I don't know how to vectorize on this architecture." 
+ #endif + /* Twiddle tables */ static const union cv FFT64_Twiddle[] = { diff --git a/algo/sm3/sm3-hash-4way.h b/algo/sm3/sm3-hash-4way.h index abe1dfd..cfd061c 100644 --- a/algo/sm3/sm3-hash-4way.h +++ b/algo/sm3/sm3-hash-4way.h @@ -65,8 +65,8 @@ extern "C" { #endif typedef struct { - __m128i block[16] __attribute__ ((aligned (64))); - __m128i digest[8]; + v128_t block[16] __attribute__ ((aligned (64))); + v128_t digest[8]; uint32_t nblocks; uint32_t num; } sm3_4way_ctx_t; diff --git a/algo/swifftx/swifftx.c b/algo/swifftx/swifftx.c index d429bbc..09ce0dc 100644 --- a/algo/swifftx/swifftx.c +++ b/algo/swifftx/swifftx.c @@ -714,42 +714,42 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output ) #undef Q_REDUCE -#elif defined(__SSE4_1__) +#elif defined(__SSE4_1__) || defined(__ARM_NEON) - __m128i F[16] __attribute__ ((aligned (64))); - __m128i *mul = (__m128i*)multipliers; - __m128i *out = (__m128i*)output; - __m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] ); + v128_t F[16] __attribute__ ((aligned (64))); + v128_t *mul = (v128_t*)multipliers; + v128_t *out = (v128_t*)output; + v128_t *tbl = (v128_t*)&( fftTable[ input[0] << 3 ] ); - F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] ); - F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[1] << 3 ] ); - F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] ); - F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[2] << 3 ] ); - F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] ); - F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[3] << 3 ] ); - F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] ); - F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[4] << 3 ] ); - F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] ); - F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[5] << 3 ] ); - F[10] = _mm_mullo_epi32( mul[10], tbl[0] ); - F[11] = _mm_mullo_epi32( mul[11], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[6] << 3 ] ); - F[12] = _mm_mullo_epi32( mul[12], tbl[0] ); - F[13] = _mm_mullo_epi32( mul[13], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[7] << 3 ] ); - F[14] = _mm_mullo_epi32( mul[14], tbl[0] ); - F[15] = _mm_mullo_epi32( mul[15], tbl[1] ); + F[ 0] = v128_mullo32( mul[ 0], tbl[0] ); + F[ 1] = v128_mullo32( mul[ 1], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[1] << 3 ] ); + F[ 2] = v128_mullo32( mul[ 2], tbl[0] ); + F[ 3] = v128_mullo32( mul[ 3], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[2] << 3 ] ); + F[ 4] = v128_mullo32( mul[ 4], tbl[0] ); + F[ 5] = v128_mullo32( mul[ 5], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[3] << 3 ] ); + F[ 6] = v128_mullo32( mul[ 6], tbl[0] ); + F[ 7] = v128_mullo32( mul[ 7], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[4] << 3 ] ); + F[ 8] = v128_mullo32( mul[ 8], tbl[0] ); + F[ 9] = v128_mullo32( mul[ 9], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[5] << 3 ] ); + F[10] = v128_mullo32( mul[10], tbl[0] ); + F[11] = v128_mullo32( mul[11], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[6] << 3 ] ); + F[12] = v128_mullo32( mul[12], tbl[0] ); + F[13] = v128_mullo32( mul[13], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[7] << 3 ] ); + F[14] = v128_mullo32( mul[14], tbl[0] ); + F[15] = v128_mullo32( mul[15], tbl[1] ); #define ADD_SUB( a, b ) \ { \ - __m128i tmp = b; \ - b = _mm_sub_epi32( a, b ); \ - a = _mm_add_epi32( a, tmp ); \ + v128_t tmp = b; \ + b = v128_sub32( a, b ); \ + a = v128_add32( a, tmp ); \ } ADD_SUB( F[ 0], F[ 2] ); @@ -760,10 +760,10 @@ void FFT( const 
unsigned char input[EIGHTH_N], swift_int32_t *output ) ADD_SUB( F[ 9], F[11] ); ADD_SUB( F[12], F[14] ); ADD_SUB( F[13], F[15] ); - F[ 6] = _mm_slli_epi32( F[ 6], 4 ); - F[ 7] = _mm_slli_epi32( F[ 7], 4 ); - F[14] = _mm_slli_epi32( F[14], 4 ); - F[15] = _mm_slli_epi32( F[15], 4 ); + F[ 6] = v128_sl32( F[ 6], 4 ); + F[ 7] = v128_sl32( F[ 7], 4 ); + F[14] = v128_sl32( F[14], 4 ); + F[15] = v128_sl32( F[15], 4 ); ADD_SUB( F[ 0], F[ 4] ); ADD_SUB( F[ 1], F[ 5] ); ADD_SUB( F[ 2], F[ 6] ); @@ -772,12 +772,12 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output ) ADD_SUB( F[ 9], F[13] ); ADD_SUB( F[10], F[14] ); ADD_SUB( F[11], F[15] ); - F[10] = _mm_slli_epi32( F[10], 2 ); - F[11] = _mm_slli_epi32( F[11], 2 ); - F[12] = _mm_slli_epi32( F[12], 4 ); - F[13] = _mm_slli_epi32( F[13], 4 ); - F[14] = _mm_slli_epi32( F[14], 6 ); - F[15] = _mm_slli_epi32( F[15], 6 ); + F[10] = v128_sl32( F[10], 2 ); + F[11] = v128_sl32( F[11], 2 ); + F[12] = v128_sl32( F[12], 4 ); + F[13] = v128_sl32( F[13], 4 ); + F[14] = v128_sl32( F[14], 6 ); + F[15] = v128_sl32( F[15], 6 ); ADD_SUB( F[ 0], F[ 8] ); ADD_SUB( F[ 1], F[ 9] ); ADD_SUB( F[ 2], F[10] ); @@ -789,10 +789,10 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output ) #undef ADD_SUB - const __m128i mask = _mm_set1_epi32( 0x000000ff ); + const v128_t mask = v128_32( 0x000000ff ); #define Q_REDUCE( a ) \ - _mm_sub_epi32( _mm_and_si128( a, mask ), _mm_srai_epi32( a, 8 ) ) + v128_sub32( v128_and( a, mask ), v128_sra32( a, 8 ) ) out[ 0] = Q_REDUCE( F[ 0] ); out[ 1] = Q_REDUCE( F[ 1] ); @@ -1261,14 +1261,14 @@ void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output, #elif defined(__SSE4_1__) - __m128i *res = (__m128i*)result; + v128_t *res = (v128_t*)result; for ( j = 0; j < N/4; ++j ) { - __m128i sum = _mm_setzero_si128(); - const __m128i *f = (__m128i*)input + j; - const __m128i *k = (__m128i*)a + j; + v128_t sum = v128_zero; + const v128_t *f = (v128_t*)input + j; + const v128_t *k = (v128_t*)a + j; for ( i = 0; i < m; i++, f += N/4, k += N/4 ) - sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) ); + sum = v128_add32( sum, v128_mullo32( *f, *k ) ); res[j] = sum; } diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index eeb2e5d..4da1467 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -101,7 +101,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce, const int thr_id = mythr->id; const bool bench = opt_benchmark; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); verthash_sha3_512_prehash_72( edata ); do diff --git a/algo/x11/c11.c b/algo/x11/c11.c index d843b82..3bd856f 100644 --- a/algo/x11/c11.c +++ b/algo/x11/c11.c @@ -12,9 +12,13 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" @@ -37,7 +41,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; - hashState_luffa luffa; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else + hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -59,7 +67,11 @@ void init_c11_ctx() sph_skein512_init( &c11_ctx.skein ); sph_jh512_init( 
&c11_ctx.jh ); sph_keccak512_init( &c11_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init( &c11_ctx.luffa ); +#else init_luffa( &c11_ctx.luffa, 512 ); +#endif cubehashInit( &c11_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &c11_ctx.shavite ); init_sd( &c11_ctx.simd, 512 ); @@ -94,8 +106,13 @@ void c11_hash( void *output, const void *input ) sph_skein512( &ctx.skein, (const void*) hash, 64 ); sph_skein512_close( &ctx.skein, hash ); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 64 ); diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c index 94e5ae8..17138d5 100644 --- a/algo/x11/timetravel-4way.c +++ b/algo/x11/timetravel-4way.c @@ -144,17 +144,17 @@ void timetravel_4way_hash(void *output, const void *input) break; case 7: dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)hash0, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash0, + hash0, dataLen ); memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*)hash1, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash1, + hash1, dataLen ); memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*)hash2, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash2, + hash2, dataLen ); memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*)hash3, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash3, + hash3, dataLen ); if ( i != 7 ) intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); break; diff --git a/algo/x11/timetravel.c b/algo/x11/timetravel.c index c6a593c..b96f83c 100644 --- a/algo/x11/timetravel.c +++ b/algo/x11/timetravel.c @@ -11,13 +11,17 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #ifdef __AES__ #include "algo/groestl/aes_ni/hash-groestl.h" #else #include "algo/groestl/sph_groestl.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif static __thread uint32_t s_ntime = UINT32_MAX; static __thread int permutation[TT8_FUNC_COUNT] = { 0 }; @@ -28,7 +32,11 @@ typedef struct { sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; #ifdef __AES__ hashState_groestl groestl; @@ -47,7 +55,11 @@ void init_tt8_ctx() sph_skein512_init( &tt_ctx.skein ); sph_jh512_init( &tt_ctx.jh ); sph_keccak512_init( &tt_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init( &tt_ctx.luffa ); +#else init_luffa( &tt_ctx.luffa, 512 ); +#endif cubehashInit( &tt_ctx.cube, 512, 16, 32 ); #ifdef __AES__ init_groestl( &tt_ctx.groestl, 64 ); @@ -171,26 +183,37 @@ void timetravel_hash(void *output, const void *input) case 6: if ( i == 0 ) { +#if defined(__aarch64__) memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence *)input + 64, 16 ); + sph_luffa512( &ctx.luffa, input + 64, 16 ); + 
sph_luffa512_close( &ctx.luffa, hashB ); +#else + memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa ); + update_and_final_luffa( &ctx.luffa, hashB, + input + 64, 16 ); +#endif } else { - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence *)hashA, dataLen ); +#if defined(__aarch64__) + sph_luffa512( &ctx.luffa, hashA, dataLen ); + sph_luffa512_close( &ctx.luffa, hashB ); +#else + update_and_final_luffa( &ctx.luffa, hashB, + hashA, dataLen ); +#endif } break; case 7: if ( i == 0 ) { memcpy( &ctx.cube, &tt_mid.cube, sizeof tt_mid.cube ); - cubehashUpdateDigest( &ctx.cube, (byte*)hashB, - (const byte*)input + midlen, tail ); + cubehashUpdateDigest( &ctx.cube, hashB, + input + midlen, tail ); } else { - cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)hashA, + cubehashUpdateDigest( &ctx.cube, hashB, hashA, dataLen ); } break; @@ -264,11 +287,15 @@ int scanhash_timetravel( struct work *work, uint32_t max_nonce, break; case 6: memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) ); - update_luffa( &tt_mid.luffa, (const BitSequence*)endiandata, 64 ); +#if defined(__aarch64__) + sph_luffa512( &tt_mid.luffa, endiandata, 64 ); +#else + update_luffa( &tt_mid.luffa, endiandata, 64 ); +#endif break; case 7: memcpy( &tt_mid.cube, &tt_ctx.cube, sizeof(tt_mid.cube ) ); - cubehashUpdate( &tt_mid.cube, (const byte*)endiandata, 64 ); + cubehashUpdate( &tt_mid.cube, endiandata, 64 ); break; default: break; diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c index 2271c68..90a0750 100644 --- a/algo/x11/timetravel10-4way.c +++ b/algo/x11/timetravel10-4way.c @@ -151,17 +151,17 @@ void timetravel10_4way_hash(void *output, const void *input) case 7: dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)hash0, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash0, + hash0, dataLen ); memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*)hash1, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash1, + hash1, dataLen ); memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*)hash2, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash2, + hash2, dataLen ); memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*)hash3, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash3, + hash3, dataLen ); if ( i != 9 ) intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); break; diff --git a/algo/x11/timetravel10.c b/algo/x11/timetravel10.c index 073ba55..2cd400f 100644 --- a/algo/x11/timetravel10.c +++ b/algo/x11/timetravel10.c @@ -11,7 +11,6 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/nist.h" @@ -20,6 +19,11 @@ #else #include "algo/groestl/sph_groestl.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif static __thread uint32_t s_ntime = UINT32_MAX; static __thread int permutation[TT10_FUNC_COUNT] = { 0 }; @@ -30,7 +34,11 @@ typedef struct { sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; 
+#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -51,7 +59,11 @@ void init_tt10_ctx() sph_skein512_init( &tt10_ctx.skein ); sph_jh512_init( &tt10_ctx.jh ); sph_keccak512_init( &tt10_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init( &tt10_ctx.luffa ); +#else init_luffa( &tt10_ctx.luffa, 512 ); +#endif cubehashInit( &tt10_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &tt10_ctx.shavite ); init_sd( &tt10_ctx.simd, 512 ); @@ -177,14 +189,25 @@ void timetravel10_hash(void *output, const void *input) case 6: if ( i == 0 ) { +#if defined(__aarch64__) + memcpy( &ctx.luffa, &tt10_mid.luffa, sizeof tt10_mid.luffa ); + sph_luffa512( &ctx.luffa, input + 64, 16 ); + sph_luffa512_close( &ctx.luffa, hashB ); +#else memcpy( &ctx.luffa, &tt10_mid.luffa, sizeof tt10_mid.luffa ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, (const BitSequence *)input + 64, 16 ); +#endif } else { +#if defined(__aarch64__) + sph_luffa512( &ctx.luffa, hashA, dataLen ); + sph_luffa512_close( &ctx.luffa, hashB ); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, (const BitSequence *)hashA, dataLen ); +#endif } break; case 7: @@ -297,7 +320,11 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce, break; case 6: memcpy( &tt10_mid.luffa, &tt10_ctx.luffa, sizeof(tt10_mid.luffa ) ); +#if defined(__aarch64__) + sph_luffa512( &tt10_mid.luffa, endiandata, 64 ); +#else update_luffa( &tt10_mid.luffa, (const BitSequence*)endiandata, 64 ); +#endif break; case 7: memcpy( &tt10_mid.cube, &tt10_ctx.cube, sizeof(tt10_mid.cube ) ); diff --git a/algo/x11/x11.c b/algo/x11/x11.c index 48135d5..026630f 100644 --- a/algo/x11/x11.c +++ b/algo/x11/x11.c @@ -13,7 +13,6 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" @@ -24,6 +23,11 @@ #include "algo/groestl/sph_groestl.h" #include "algo/echo/sph_echo.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -38,7 +42,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; - hashState_luffa luffa; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else + hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -60,7 +68,11 @@ void init_x11_ctx() sph_skein512_init( &x11_ctx.skein ); sph_jh512_init( &x11_ctx.jh ); sph_keccak512_init( &x11_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init( &x11_ctx.luffa ); +#else init_luffa( &x11_ctx.luffa, 512 ); +#endif cubehashInit( &x11_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x11_ctx.shavite ); init_sd( &x11_ctx.simd, 512 ); @@ -97,8 +109,13 @@ void x11_hash( void *state, const void *input ) sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); sph_keccak512_close( &ctx.keccak, hash ); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_luffa( &ctx.luffa, (const BitSequence*)hash, 64 ); final_luffa( &ctx.luffa, (BitSequence*)hash ); +#endif cubehashUpdate( &ctx.cube, (const byte*) hash, 64 ); cubehashDigest( &ctx.cube, (byte*)hash ); diff --git a/algo/x11/x11evo.c b/algo/x11/x11evo.c index 153390a..aa3873d 100644 --- a/algo/x11/x11evo.c +++ b/algo/x11/x11evo.c @@ -19,9 +19,13 @@ #include 
"algo/groestl/sph_groestl.h" #include "algo/echo/sph_echo.h" #endif -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { #ifdef __AES__ @@ -31,7 +35,11 @@ typedef struct { sph_groestl512_context groestl; sph_echo512_context echo; #endif - hashState_luffa luffa; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else + hashState_luffa luffa; +#endif cubehashParam cube; hashState_sd simd; sph_blake512_context blake; @@ -53,7 +61,11 @@ void init_x11evo_ctx() sph_groestl512_init( &x11evo_ctx.groestl ); sph_echo512_init( &x11evo_ctx.echo ); #endif +#if defined(__aarch64__) + sph_luffa512_init( &x11evo_ctx.luffa ); +#else init_luffa( &x11evo_ctx.luffa, 512 ); +#endif cubehashInit( &x11evo_ctx.cube, 512, 16, 32 ); init_sd( &x11evo_ctx.simd, 512 ); sph_blake512_init( &x11evo_ctx.blake ); @@ -124,9 +136,14 @@ void x11evo_hash( void *state, const void *input ) sph_keccak512_close( &ctx.keccak, (char*)hash ); break; case 6: +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (char*)hash, (const char*)hash, 64 ); - break; +#endif + break; case 7: cubehashUpdateDigest( &ctx.cube, (char*)hash, (const char*)hash, 64 ); diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c index 30523fa..243d038 100644 --- a/algo/x11/x11gost.c +++ b/algo/x11/x11gost.c @@ -13,7 +13,6 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" @@ -24,6 +23,11 @@ #include "algo/groestl/sph_groestl.h" #include "algo/echo/sph_echo.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -38,7 +42,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -63,7 +71,11 @@ void init_x11gost_ctx() sph_keccak512_init( &x11gost_ctx.keccak ); sph_gost512_init( &x11gost_ctx.gost ); sph_shavite512_init( &x11gost_ctx.shavite ); +#if defined(__aarch64__) + sph_luffa512_init(&x11gost_ctx.luffa ); +#else init_luffa( &x11gost_ctx.luffa, 512 ); +#endif cubehashInit( &x11gost_ctx.cube, 512, 16, 32 ); init_sd( &x11gost_ctx.simd, 512 ); } @@ -102,8 +114,14 @@ void x11gost_hash(void *output, const void *input) sph_gost512( &ctx.gost, hash, 64 ); sph_gost512_close( &ctx.gost, hash ); +#if defined(__aarch64__) + sph_luffa512_init(&ctx.luffa ); + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); diff --git a/algo/x12/x12.c b/algo/x12/x12.c index ca1a3ca..a478655 100644 --- a/algo/x12/x12.c +++ b/algo/x12/x12.c @@ -16,13 +16,17 @@ #include "algo/shavite/sph_shavite.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #if 
defined(__AES__) #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -37,7 +41,11 @@ typedef struct { sph_groestl512_context groestl; sph_echo512_context echo; #endif - hashState_luffa luffa; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else + hashState_luffa luffa; +#endif cubehashParam cubehash; sph_shavite512_context shavite; hashState_sd simd; @@ -60,7 +68,11 @@ void init_x12_ctx() sph_groestl512_init(&x12_ctx.groestl); sph_echo512_init(&x12_ctx.echo); #endif - init_luffa( &x12_ctx.luffa, 512 ); +#if defined(__aarch64__) + sph_luffa512_init(&x12_ctx.luffa ); +#else + init_luffa( &x12_ctx.luffa, 512 ); +#endif cubehashInit( &x12_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &x12_ctx.shavite ); init_sd( &x12_ctx.simd, 512 ); @@ -82,8 +94,13 @@ void x12hash(void *output, const void *input) sph_bmw512(&ctx.bmw, hash, 64); sph_bmw512_close(&ctx.bmw, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hashB); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, (const byte*)hashB, 64 ); diff --git a/algo/x13/phi1612.c b/algo/x13/phi1612.c index 33a17ee..cbde03e 100644 --- a/algo/x13/phi1612.c +++ b/algo/x13/phi1612.c @@ -72,7 +72,7 @@ void phi1612_hash(void *output, const void *input) sph_jh512( &ctx.jh, (const void*)hash, 64 ); sph_jh512_close( &ctx.jh, (void*)hash ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); + cubehashUpdateDigest( &ctx.cube, hash, hash, 64 ); #if defined(__AES__) fugue512_Update( &ctx.fugue, hash, 512 ); diff --git a/algo/x13/skunk.c b/algo/x13/skunk.c index 25549ff..d258710 100644 --- a/algo/x13/skunk.c +++ b/algo/x13/skunk.c @@ -38,7 +38,7 @@ void skunkhash( void *output, const void *input ) sph_skein512( &ctx.skein, input+64, 16 ); sph_skein512_close( &ctx.skein, (void*) hash ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); + cubehashUpdateDigest( &ctx.cube, hash, hash, 64 ); #if defined(__AES__) fugue512_Update( &ctx.fugue, hash, 512 ); diff --git a/algo/x13/x13.c b/algo/x13/x13.c index ca66e00..7c8ef22 100644 --- a/algo/x13/x13.c +++ b/algo/x13/x13.c @@ -26,6 +26,11 @@ #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -42,7 +47,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cubehash; sph_shavite512_context shavite; hashState_sd simd; @@ -67,7 +76,11 @@ void init_x13_ctx() sph_skein512_init( &x13_ctx.skein ); sph_jh512_init( &x13_ctx.jh ); sph_keccak512_init( &x13_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init(&x13_ctx.luffa ); +#else init_luffa( &x13_ctx.luffa, 512 ); +#endif cubehashInit( &x13_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &x13_ctx.shavite ); init_sd( &x13_ctx.simd, 512 ); @@ -103,8 +116,13 @@ void x13hash(void *output, const void *input) sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); sph_keccak512_close( &ctx.keccak, hash ); +#if 
defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, (const byte*)hash, 64 ); diff --git a/algo/x13/x13sm3.c b/algo/x13/x13sm3.c index 6025739..0f8136a 100644 --- a/algo/x13/x13sm3.c +++ b/algo/x13/x13sm3.c @@ -143,7 +143,6 @@ void x13sm3_hash(void *output, const void *input) sph_fugue512(&ctx.fugue, hash, 64); sph_fugue512_close(&ctx.fugue, hash); - asm volatile ("emms"); memcpy(output, hash, 32); } diff --git a/algo/x14/polytimos.c b/algo/x14/polytimos.c index e81c479..b186133 100644 --- a/algo/x14/polytimos.c +++ b/algo/x14/polytimos.c @@ -9,12 +9,16 @@ #include "algo/skein/sph_skein.h" #include "algo/echo/sph_echo.h" #include "algo/fugue//sph_fugue.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/shabal/sph_shabal.h" #include "algo/gost/sph_gost.h" #ifdef __AES__ #include "algo/echo/aes_ni/hash_api.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_skein512_context skein; @@ -24,7 +28,11 @@ typedef struct { #else sph_echo512_context echo; #endif +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif sph_fugue512_context fugue; sph_gost512_context gost; } poly_ctx_holder; @@ -40,7 +48,11 @@ void init_polytimos_ctx() #else sph_echo512_init(&poly_ctx.echo); #endif +#if defined(__aarch64__) + sph_luffa512_init(&poly_ctx.luffa ); +#else init_luffa( &poly_ctx.luffa, 512 ); +#endif sph_fugue512_init(&poly_ctx.fugue); sph_gost512_init(&poly_ctx.gost); } @@ -65,8 +77,13 @@ void polytimos_hash(void *output, const void *input) sph_echo512_close(&ctx.echo, hashA); #endif - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA, - (const BitSequence*)hashA, 64 ); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hashA, 64); + sph_luffa512_close(&ctx.luffa, hashA); +#else + update_and_final_luffa( &ctx.luffa, hashA, + hashA, 64 ); +#endif sph_fugue512(&ctx.fugue, hashA, 64); sph_fugue512_close(&ctx.fugue, hashA); diff --git a/algo/x14/x14.c b/algo/x14/x14.c index fdbcacb..a861a4f 100644 --- a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -14,7 +14,6 @@ #include "algo/shavite/sph_shavite.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #if defined(__AES__) @@ -26,6 +25,11 @@ #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -42,7 +46,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -68,7 +76,11 @@ void init_x14_ctx() sph_skein512_init( &x14_ctx.skein ); sph_jh512_init( &x14_ctx.jh ); sph_keccak512_init( &x14_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init( &x14_ctx.luffa ); +#else init_luffa( &x14_ctx.luffa,512 ); +#endif cubehashInit( &x14_ctx.cube,512,16,32 ); sph_shavite512_init( &x14_ctx.shavite ); init_sd( &x14_ctx.simd,512 ); @@ -105,8 +117,13 @@ void x14hash(void *output, const void 
*input) sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); sph_keccak512_close( &ctx.keccak, hash ); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); diff --git a/algo/x15/x15.c b/algo/x15/x15.c index 73d64db..568e554 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -16,7 +16,6 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" @@ -29,6 +28,11 @@ #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -45,7 +49,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cubehash; sph_shavite512_context shavite; hashState_sd simd; @@ -72,7 +80,11 @@ void init_x15_ctx() sph_skein512_init( &x15_ctx.skein ); sph_jh512_init( &x15_ctx.jh ); sph_keccak512_init( &x15_ctx.keccak ); - init_luffa( &x15_ctx.luffa, 512 ); +#if defined(__aarch64__) + sph_luffa512_init( &x15_ctx.luffa ); +#else + init_luffa( &x15_ctx.luffa,512 ); +#endif cubehashInit( &x15_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &x15_ctx.shavite ); init_sd( &x15_ctx.simd, 512 ); @@ -112,8 +124,13 @@ void x15hash(void *output, const void *input) sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); sph_keccak512_close( &ctx.keccak, hash ); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, (const byte*)hash, 64 ); diff --git a/algo/x16/hex.c b/algo/x16/hex.c index b707ed1..f68bdc4 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -86,13 +86,26 @@ int hex_hash( void* output, const void* input, int thrid ) break; case LUFFA: if ( i == 0 ) - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + { +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) in+64, 16 ); + sph_luffa512_close(&ctx.luffa, hash); +#else + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)in+64, 16 ); +#endif + } else { +#if defined(__aarch64__) + sph_luffa512_init(&ctx.luffa ); + sph_luffa512(&ctx.luffa, (const void*) in, size ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)in, size ); +#endif } break; case CUBEHASH: @@ -192,7 +205,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; if ( bench ) ptarget[7] = 0x0cff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; uint32_t ntime = swab32(pdata[17]); @@ -218,8 +231,13 @@ int scanhash_hex( struct work *work, uint32_t max_nonce, sph_skein512( &hex_ctx.skein, edata, 64 ); break; case LUFFA: +#if defined(__aarch64__) + sph_luffa512_init(&hex_ctx.luffa ); + sph_luffa512(&hex_ctx.luffa, (const 
void*) edata, 64); +#else init_luffa( &hex_ctx.luffa, 512 ); update_luffa( &hex_ctx.luffa, (const BitSequence*)edata, 64 ); +#endif break; case CUBEHASH: cubehashInit( &hex_ctx.cube, 512, 16, 32 ); diff --git a/algo/x16/minotaur.c b/algo/x16/minotaur.c index cecc408..d62b514 100644 --- a/algo/x16/minotaur.c +++ b/algo/x16/minotaur.c @@ -11,7 +11,6 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #include "algo/hamsi/sph_hamsi.h" @@ -28,6 +27,11 @@ #include "algo/groestl/sph_groestl.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif // Config #define MINOTAUR_ALGO_COUNT 16 @@ -55,7 +59,11 @@ struct TortureGarden sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; shavite512_context shavite; hashState_sd simd; @@ -141,9 +149,15 @@ static int get_hash( void *output, const void *input, TortureGarden *garden, sph_keccak512_close(&garden->keccak, hash); break; case 10: +#if defined(__aarch64__) + sph_luffa512_init(&garden->luffa ); + sph_luffa512(&garden->luffa, (const void*) input, 64); + sph_luffa512_close(&garden->luffa, hash); +#else init_luffa( &garden->luffa, 512 ); update_and_final_luffa( &garden->luffa, (BitSequence*)hash, (const BitSequence*)input, 64 ); +#endif break; case 11: sph_shabal512_init(&garden->shabal); @@ -287,7 +301,7 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; uint64_t skipped = 0; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); do { edata[19] = n; diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 3357db8..b16970b 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -47,7 +47,7 @@ void x16r_8way_prehash( void *vdata, void *pdata ) case LUFFA: { hashState_luffa ctx_luffa; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); intrlv_8x64( vdata, edata, edata, edata, edata, edata, edata, edata, edata, 640 ); init_luffa( &ctx_luffa, 512 ); @@ -63,7 +63,7 @@ void x16r_8way_prehash( void *vdata, void *pdata ) case CUBEHASH: { cubehashParam ctx_cube; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); intrlv_8x64( vdata, edata, edata, edata, edata, edata, edata, edata, edata, 640 ); cubehashInit( &ctx_cube, 512, 16, 32 ); @@ -82,7 +82,7 @@ void x16r_8way_prehash( void *vdata, void *pdata ) hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 72 ); break; case FUGUE: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); fugue512_init( &x16r_ctx.fugue ); fugue512_update( &x16r_ctx.fugue, edata, 76 ); intrlv_8x64( vdata, edata, edata, edata, edata, @@ -95,7 +95,7 @@ void x16r_8way_prehash( void *vdata, void *pdata ) rintrlv_8x32_8x64( vdata, vdata2, 640 ); break; case WHIRLPOOL: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_whirlpool_init( &x16r_ctx.whirlpool ); sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 ); intrlv_8x64( vdata, edata, edata, edata, edata, @@ -573,7 +573,7 @@ void x16r_4way_prehash( void *vdata, void *pdata ) case LUFFA: { hashState_luffa ctx_luffa; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); 
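/*
 * The LUFFA and CUBEHASH prehash cases above exploit the fact that the first
 * 64 bytes of the byte-swapped 80-byte header are constant for a work unit:
 * they are absorbed once, then each nonce clones the saved context and hashes
 * only the 16-byte tail.  A minimal sketch of that pattern with the sph_luffa
 * calls used on the AArch64 path (the helper names here are illustrative):
 */
#include <string.h>
#include "algo/luffa/sph_luffa.h"

static sph_luffa512_context luffa_mid;           // midstate, set once per work

static void luffa_prehash( const void *edata )   // edata: 80 swapped bytes
{
    sph_luffa512_init( &luffa_mid );
    sph_luffa512( &luffa_mid, edata, 64 );       // constant first block
}

static void luffa_final_16( void *hash, const void *edata )
{
    sph_luffa512_context ctx;
    memcpy( &ctx, &luffa_mid, sizeof ctx );              // clone the midstate
    sph_luffa512( &ctx, (const char*)edata + 64, 16 );   // nonce tail only
    sph_luffa512_close( &ctx, hash );                    // 64-byte digest
}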
init_luffa( &ctx_luffa, 512 ); update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 ); @@ -588,7 +588,7 @@ void x16r_4way_prehash( void *vdata, void *pdata ) case CUBEHASH: { cubehashParam ctx_cube; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); cubehashInit( &ctx_cube, 512, 16, 32 ); cubehashUpdate( &ctx_cube, (const byte*)edata, 64 ); @@ -605,19 +605,19 @@ void x16r_4way_prehash( void *vdata, void *pdata ) hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 ); break; case FUGUE: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); fugue512_init( &x16r_ctx.fugue ); fugue512_update( &x16r_ctx.fugue, edata, 76 ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); break; case SHABAL: - mm128_bswap32_intrlv80_4x32( vdata2, pdata ); + v128_bswap32_intrlv80_4x32( vdata2, pdata ); shabal512_4way_init( &x16r_ctx.shabal ); shabal512_4way_update( &x16r_ctx.shabal, vdata2, 64 ); rintrlv_4x32_4x64( vdata, vdata2, 640 ); break; case WHIRLPOOL: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_whirlpool_init( &x16r_ctx.whirlpool ); sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 3c82b46..90354e2 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -12,7 +12,6 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #include "algo/echo/sph_echo.h" @@ -23,33 +22,37 @@ #include "algo/sha/sha512-hash.h" #if defined(__AES__) -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/fugue/fugue-aesni.h" + #include "algo/echo/aes_ni/hash_api.h" + #include "algo/groestl/aes_ni/hash-groestl.h" + #include "algo/fugue/fugue-aesni.h" #endif #if defined (__AVX2__) - -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cube-hash-2way.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/shabal/shabal-hash-4way.h" - -#if defined(__VAES__) -#include "algo/groestl/groestl512-hash-4way.h" -#include "algo/shavite/shavite-hash-2way.h" -#include "algo/shavite/shavite-hash-4way.h" -#include "algo/echo/echo-hash-4way.h" + #include "algo/bmw/bmw-hash-4way.h" + #include "algo/groestl/aes_ni/hash-groestl.h" + #include "algo/skein/skein-hash-4way.h" + #include "algo/jh/jh-hash-4way.h" + #include "algo/keccak/keccak-hash-4way.h" + #include "algo/luffa/luffa-hash-2way.h" + #include "algo/cubehash/cube-hash-2way.h" + #include "algo/simd/simd-hash-2way.h" + #include "algo/echo/aes_ni/hash_api.h" + #include "algo/hamsi/hamsi-hash-4way.h" + #include "algo/shabal/shabal-hash-4way.h" #endif -#endif // AVX2 +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-2way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif + +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif #if defined(__AVX512F__) && defined(__AVX512VL__) && 
defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -203,7 +206,11 @@ union _x16r_context_overlay sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; shavite512_context shavite; hashState_sd simd; diff --git a/algo/x16/x16r.c b/algo/x16/x16r.c index b04fd3e..5a2b7e3 100644 --- a/algo/x16/x16r.c +++ b/algo/x16/x16r.c @@ -26,8 +26,14 @@ void x16r_prehash( void *edata, void *pdata ) sph_skein512( &x16_ctx.skein, edata, 64 ); break; case LUFFA: +#if defined(__aarch64__) + sph_luffa512_init( &x16_ctx.luffa ); + sph_luffa512( &x16_ctx.luffa, edata, 64 ); + +#else init_luffa( &x16_ctx.luffa, 512 ); update_luffa( &x16_ctx.luffa, (const BitSequence*)edata, 64 ); +#endif break; case CUBEHASH: cubehashInit( &x16_ctx.cube, 512, 16, 32 ); @@ -108,13 +114,24 @@ int x16r_hash_generic( void* output, const void* input, int thrid ) sph_skein512_close( &ctx.skein, hash ); break; case LUFFA: +#if defined(__aarch64__) + if ( i == 0 ) + sph_luffa512(&ctx.luffa, in+64, 16 ); + else + { + sph_luffa512_init( &ctx.luffa ); + sph_luffa512( &ctx.luffa, in, size ); + } + sph_luffa512_close( &ctx.luffa, hash ); +#else if ( i == 0 ) update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)in+64, 16 ); else luffa_full( &ctx.luffa, (BitSequence*)hash, 512, (const BitSequence*)in, size ); - break; +#endif + break; case CUBEHASH: if ( i == 0 ) cubehashUpdateDigest( &ctx.cube, (byte*)hash, @@ -216,7 +233,7 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; if ( bench ) ptarget[7] = 0x0cff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; uint32_t ntime = bswap_32( pdata[17] ); diff --git a/algo/x16/x16rt.c b/algo/x16/x16rt.c index 0d2b663..954aca3 100644 --- a/algo/x16/x16rt.c +++ b/algo/x16/x16rt.c @@ -17,7 +17,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; if ( bench ) ptarget[7] = 0x0cff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80; diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index b94c4a3..2f9c112 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -605,7 +605,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, case KECCAK: case LUFFA: case SHA_512: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_tiger_init( &x16rv2_ctx.tiger ); sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); intrlv_8x64( vdata, edata, edata, edata, edata, @@ -617,7 +617,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 ); break; case CUBEHASH: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); intrlv_8x64( vdata, edata, edata, edata, edata, @@ -635,7 +635,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, rintrlv_8x32_8x64( vdata, vdata2, 640 ); break; case WHIRLPOOL: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_whirlpool_init( &x16rv2_ctx.whirlpool ); sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); intrlv_8x64( vdata, edata, edata, edata, edata, @@ -1094,7 +1094,7 @@ int 
scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, case KECCAK: case LUFFA: case SHA_512: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_tiger_init( &x16rv2_ctx.tiger ); sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); @@ -1104,7 +1104,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, skein512_4way_prehash64( &x16r_ctx.skein, vdata ); break; case CUBEHASH: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); @@ -1115,13 +1115,13 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 ); break; case SHABAL: - mm128_bswap32_intrlv80_4x32( vdata32, pdata ); + v128_bswap32_intrlv80_4x32( vdata32, pdata ); shabal512_4way_init( &x16rv2_ctx.shabal ); shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 ); rintrlv_4x32_4x64( vdata, vdata32, 640 ); break; case WHIRLPOOL: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_whirlpool_init( &x16rv2_ctx.whirlpool ); sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); diff --git a/algo/x16/x16rv2.c b/algo/x16/x16rv2.c index b6f8d19..51ace2e 100644 --- a/algo/x16/x16rv2.c +++ b/algo/x16/x16rv2.c @@ -26,7 +26,11 @@ union _x16rv2_context_overlay sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; shavite512_context shavite; hashState_sd simd; @@ -102,9 +106,15 @@ int x16rv2_hash( void* output, const void* input, int thrid ) sph_tiger( &ctx.tiger, in, size ); sph_tiger_close( &ctx.tiger, hash ); padtiger512( hash ); +#if defined(__aarch64__) + sph_luffa512_init(&ctx.luffa ); + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif break; case CUBEHASH: cubehashInit( &ctx.cube, 512, 16, 32 ); @@ -183,11 +193,11 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce, volatile uint8_t *restart = &(work_restart[thr_id].restart); const bool bench = opt_benchmark; - casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + casti_v128( edata, 0 ) = v128_bswap32( casti_v128( pdata, 0 ) ); + casti_v128( edata, 1 ) = v128_bswap32( casti_v128( pdata, 1 ) ); + casti_v128( edata, 2 ) = v128_bswap32( casti_v128( pdata, 2 ) ); + casti_v128( edata, 3 ) = v128_bswap32( casti_v128( pdata, 3 ) ); + casti_v128( edata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) ); static __thread uint32_t s_ntime = UINT32_MAX; if ( s_ntime != pdata[17] ) diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c index 96782e2..7e6ef19 100644 --- a/algo/x16/x21s.c +++ b/algo/x16/x21s.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" @@ -71,7 +72,7 @@ int scanhash_x21s( struct 
work *work, uint32_t max_nonce, const bool bench = opt_benchmark; if ( bench ) ptarget[7] = 0x0cff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; if ( s_ntime != pdata[17] ) diff --git a/algo/x17/sonoa.c b/algo/x17/sonoa.c index d9fede2..5910e53 100644 --- a/algo/x17/sonoa.c +++ b/algo/x17/sonoa.c @@ -17,7 +17,6 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/sph-haval.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #include "algo/sha/sph_sha2.h" @@ -30,6 +29,11 @@ #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -46,7 +50,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cubehash; sph_shavite512_context shavite; hashState_sd simd; @@ -75,7 +83,11 @@ void init_sonoa_ctx() sph_skein512_init( &sonoa_ctx.skein); sph_jh512_init( &sonoa_ctx.jh); sph_keccak512_init( &sonoa_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init(&sonoa_ctx.luffa); +#else init_luffa( &sonoa_ctx.luffa, 512 ); +#endif cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &sonoa_ctx.shavite ); init_sd( &sonoa_ctx.simd, 512 ); @@ -115,6 +127,10 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); @@ -126,6 +142,7 @@ int sonoa_hash( void *state, const void *input, int thr_id ) update_final_sd( &ctx.simd, (BitSequence *)hash, (const BitSequence *)hash, 512 ); +#endif #if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, @@ -164,9 +181,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, @@ -222,9 +244,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, @@ -289,9 +316,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 
64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, @@ -379,9 +411,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, @@ -450,9 +487,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, @@ -530,9 +572,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, diff --git a/algo/x17/x17.c b/algo/x17/x17.c index 8b5cf6d..2bd7875 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -11,12 +11,12 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" + #include "algo/luffa/luffa_for_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/sph-haval.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #include "algo/sha/sph_sha2.h" diff --git a/algo/x17/xevan.c b/algo/x17/xevan.c index 470add1..479f77a 100644 --- a/algo/x17/xevan.c +++ b/algo/x17/xevan.c @@ -13,7 +13,6 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" @@ -30,6 +29,12 @@ #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif + typedef struct { sph_blake512_context blake; @@ -37,7 +42,11 @@ typedef struct { sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cubehash; sph_shavite512_context shavite; hashState_sd simd; @@ -66,7 +75,11 @@ void init_xevan_ctx() sph_skein512_init(&xevan_ctx.skein); sph_jh512_init(&xevan_ctx.jh); sph_keccak512_init(&xevan_ctx.keccak); +#if defined(__aarch64__) + sph_luffa512_init(&xevan_ctx.luffa); +#else init_luffa( &xevan_ctx.luffa, 512 ); +#endif cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &xevan_ctx.shavite ); init_sd( &xevan_ctx.simd, 512 ); @@ -80,7 +93,7 @@ void 
init_xevan_ctx() init_echo( &xevan_ctx.echo, 512 ); fugue512_Init( &xevan_ctx.fugue, 512 ); #else - sph_groestl512_init( &xevan_ctx.groestl ); + sph_groestl512_init( &xevan_ctx.groestl ); sph_echo512_init( &xevan_ctx.echo ); sph_fugue512_init( &xevan_ctx.fugue ); #endif @@ -117,8 +130,13 @@ int xevan_hash(void *output, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, dataLen); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, dataLen); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, dataLen ); +#endif cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, (const byte*) hash, dataLen ); @@ -187,8 +205,13 @@ int xevan_hash(void *output, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, dataLen); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, dataLen); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, dataLen ); +#endif cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, (const byte*) hash, dataLen ); diff --git a/algo/x22/x22i.c b/algo/x22/x22i.c index dbb763a..c515dfc 100644 --- a/algo/x22/x22i.c +++ b/algo/x22/x22i.c @@ -16,7 +16,6 @@ #include "algo/skein/sph_skein.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/nist.h" @@ -30,6 +29,11 @@ #include "algo/lyra2/lyra2.h" #include "algo/gost/sph_gost.h" #include "algo/swifftx/swifftx.h" +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif union _x22i_context_overlay { @@ -47,7 +51,11 @@ union _x22i_context_overlay sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -99,9 +107,15 @@ int x22i_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return 0; +#if defined(__aarch64__) + sph_luffa512_init(&ctx.luffa ); + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cube, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cube, (byte*) hash, @@ -193,7 +207,7 @@ int scanhash_x22i( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x08ff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); InitializeSWIFFTX(); diff --git a/algo/x22/x25x.c b/algo/x22/x25x.c index 370abaa..1c8237b 100644 --- a/algo/x22/x25x.c +++ b/algo/x22/x25x.c @@ -16,7 +16,6 @@ #include "algo/skein/sph_skein.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/nist.h" @@ -24,6 +23,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha256-hash.h" +#include "algo/sha/sha512-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -32,6 +32,11 @@ #include "algo/blake/sph-blake2s.h" #include 
"algo/panama/sph_panama.h" #include "algo/lanehash/lane.h" +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif union _x25x_context_overlay { @@ -49,7 +54,11 @@ union _x25x_context_overlay sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -103,9 +112,15 @@ int x25x_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return 0; +#if defined(__aarch64__) + sph_luffa512_init(&ctx.luffa ); + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash[6], (const BitSequence*)&hash[5], 64 ); +#endif cubehashInit( &ctx.cube, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cube, (byte*) &hash[7], @@ -227,7 +242,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x08ff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); InitializeSWIFFTX(); diff --git a/algo/yespower/yespower-blake2b-ref.c b/algo/yespower/yespower-blake2b-ref.c new file mode 100644 index 0000000..42bb9ab --- /dev/null +++ b/algo/yespower/yespower-blake2b-ref.c @@ -0,0 +1,593 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + * + * This is a proof-of-work focused fork of yescrypt, including reference and + * cut-down implementation of the obsolete yescrypt 0.5 (based off its first + * submission to PHC back in 2014) and a new proof-of-work specific variation + * known as yespower 1.0. The former is intended as an upgrade for + * cryptocurrencies that already use yescrypt 0.5 and the latter may be used + * as a further upgrade (hard fork) by those and other cryptocurrencies. 
The + * version of algorithm to use is requested through parameters, allowing for + * both algorithms to co-exist in client and miner implementations (such as in + * preparation for a hard-fork). + * + * This is the reference implementation. Its purpose is to provide a simple + * human- and machine-readable specification that implementations intended + * for actual use should be tested against. It is deliberately mostly not + * optimized, and it is not meant to be used in production. Instead, use + * yespower-opt.c. + */ +/* +#warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose." +*/ +#include +#include +#include +#include + +#include "crypto/hmac-blake2b.h" +//#include "sysendian.h" + +#include "yespower.h" + +static void blkcpy(uint32_t *dst, const uint32_t *src, size_t count) +{ + do { + *dst++ = *src++; + } while (--count); +} + +static void blkxor(uint32_t *dst, const uint32_t *src, size_t count) +{ + do { + *dst++ ^= *src++; + } while (--count); +} + +/** + * salsa20(B): + * Apply the Salsa20 core to the provided block. + */ +static void salsa20(uint32_t B[16], uint32_t rounds) +{ + uint32_t x[16]; + size_t i; + + /* SIMD unshuffle */ + for (i = 0; i < 16; i++) + x[i * 5 % 16] = B[i]; + + for (i = 0; i < rounds; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } + + /* SIMD shuffle */ + for (i = 0; i < 16; i++) + B[i] += x[i * 5 % 16]; +} + +/** + * blockmix_salsa(B): + * Compute B = BlockMix_{salsa20, 1}(B). The input B must be 128 bytes in + * length. + */ +static void blockmix_salsa(uint32_t *B, uint32_t rounds) +{ + uint32_t X[16]; + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &B[16], 16); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2; i++) { + /* 3: X <-- H(X xor B_i) */ + blkxor(X, &B[i * 16], 16); + salsa20(X, rounds); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&B[i * 16], X, 16); + } +} + +/* + * These are tunable, but they must meet certain constraints and are part of + * what defines a yespower version. + */ +#define PWXsimple 2 +#define PWXgather 4 +/* Version 0.5 */ +#define PWXrounds_0_5 6 +#define Swidth_0_5 8 +/* Version 1.0 */ +#define PWXrounds_1_0 3 +#define Swidth_1_0 11 + +/* Derived values. Not tunable on their own. 
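The "SIMD unshuffle" / "SIMD shuffle" loops in salsa20() above are simply the fixed permutation i -> i*5 mod 16, i.e. the word order the optimized implementations keep blocks in. A throwaway, standalone check of that mapping (illustrative only, not part of the file):

#include <assert.h>
#include <stdint.h>

/* B[i] in the shuffled layout holds word i*5 mod 16 of the canonical
 * Salsa20 state x[]. */
static const uint8_t simd_shuffle[16] = {
    0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
};

int main(void)
{
    for ( int i = 0; i < 16; i++ )
        assert( simd_shuffle[i] == (i * 5) % 16 );
    return 0;
}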
*/ +#define PWXbytes (PWXgather * PWXsimple * 8) +#define PWXwords (PWXbytes / sizeof(uint32_t)) +#define rmin ((PWXbytes + 127) / 128) + +/* Runtime derived values. Not tunable on their own. */ +#define Swidth_to_Sbytes1(Swidth) ((1 << Swidth) * PWXsimple * 8) +#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8) + +typedef struct { + yespower_version_t version; + uint32_t salsa20_rounds; + uint32_t PWXrounds, Swidth, Sbytes, Smask; + uint32_t *S; + uint32_t (*S0)[2], (*S1)[2], (*S2)[2]; + size_t w; +} pwxform_ctx_t; + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. + */ +static void pwxform(uint32_t *B, pwxform_ctx_t *ctx) +{ + uint32_t (*X)[PWXsimple][2] = (uint32_t (*)[PWXsimple][2])B; + uint32_t (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2; + uint32_t Smask = ctx->Smask; + size_t w = ctx->w; + size_t i, j, k; + + /* 1: for i = 0 to PWXrounds - 1 do */ + for (i = 0; i < ctx->PWXrounds; i++) { + /* 2: for j = 0 to PWXgather - 1 do */ + for (j = 0; j < PWXgather; j++) { + uint32_t xl = X[j][0][0]; + uint32_t xh = X[j][0][1]; + uint32_t (*p0)[2], (*p1)[2]; + + /* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */ + p0 = S0 + (xl & Smask) / sizeof(*S0); + /* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */ + p1 = S1 + (xh & Smask) / sizeof(*S1); + + /* 5: for k = 0 to PWXsimple - 1 do */ + for (k = 0; k < PWXsimple; k++) { + uint64_t x, s0, s1; + + /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) xor S1_{p1,k} */ + s0 = ((uint64_t)p0[k][1] << 32) + p0[k][0]; + s1 = ((uint64_t)p1[k][1] << 32) + p1[k][0]; + + xl = X[j][k][0]; + xh = X[j][k][1]; + + x = (uint64_t)xh * xl; + x += s0; + x ^= s1; + + X[j][k][0] = x; + X[j][k][1] = x >> 32; + } + + if (ctx->version != YESPOWER_0_5 && + (i == 0 || j < PWXgather / 2)) { + if (j & 1) { + for (k = 0; k < PWXsimple; k++) { + S1[w][0] = X[j][k][0]; + S1[w][1] = X[j][k][1]; + w++; + } + } else { + for (k = 0; k < PWXsimple; k++) { + S0[w + k][0] = X[j][k][0]; + S0[w + k][1] = X[j][k][1]; + } + } + } + } + } + + if (ctx->version != YESPOWER_0_5) { + /* 14: (S0, S1, S2) <-- (S2, S0, S1) */ + ctx->S0 = S2; + ctx->S1 = S0; + ctx->S2 = S1; + /* 15: w <-- w mod 2^Swidth */ + ctx->w = w & ((1 << ctx->Swidth) * PWXsimple - 1); + } +} + +/** + * blockmix_pwxform(B, ctx, r): + * Compute B = BlockMix_pwxform{salsa20, ctx, r}(B). The input B must be + * 128r bytes in length. 
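In pwxform() above, the S-box pointers p0/p1 come from masking the low and high 32-bit halves of B_{j,0} with Smask and dividing by the element size. With the yespower 1.0 parameters (Swidth = 11, PWXsimple = 2) the mask keeps a 16-byte-aligned byte offset that stays inside one 32 KiB S-box. A standalone check of that arithmetic (illustrative; the macro is copied from the definitions above):

#include <assert.h>
#include <stdint.h>

#define PWXsimple 2
#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8)

int main(void)
{
    uint32_t Smask = Swidth_to_Smask( 11 );      /* yespower 1.0: Swidth = 11    */
    assert( Smask == 0x7ff0 );                   /* 2047 * 16                    */
    assert( (Smask & 0xf) == 0 );                /* offsets stay 16-byte aligned */
    uint32_t xl = 0x12345678u;                   /* arbitrary lane value         */
    assert( ((xl & Smask) & 0xf) == 0 );
    assert( (xl & Smask) / 16 < (1u << 11) );    /* one of 2^Swidth groups       */
    return 0;
}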
+ */ +static void blockmix_pwxform(uint32_t *B, pwxform_ctx_t *ctx, size_t r) +{ + uint32_t X[PWXwords]; + size_t r1, i; + + /* Convert 128-byte blocks to PWXbytes blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r1 = 128 * r / PWXbytes; + + /* 2: X <-- B'_{r_1 - 1} */ + blkcpy(X, &B[(r1 - 1) * PWXwords], PWXwords); + + /* 3: for i = 0 to r_1 - 1 do */ + for (i = 0; i < r1; i++) { + /* 4: if r_1 > 1 */ + if (r1 > 1) { + /* 5: X <-- X xor B'_i */ + blkxor(X, &B[i * PWXwords], PWXwords); + } + + /* 7: X <-- pwxform(X) */ + pwxform(X, ctx); + + /* 8: B'_i <-- X */ + blkcpy(&B[i * PWXwords], X, PWXwords); + } + + /* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */ + i = (r1 - 1) * PWXbytes / 64; + + /* 11: B_i <-- H(B_i) */ + salsa20(&B[i * 16], ctx->salsa20_rounds); + +#if 1 /* No-op with our current pwxform settings, but do it to make sure */ + /* 12: for i = i + 1 to 2r - 1 do */ + for (i++; i < 2 * r; i++) { + /* 13: B_i <-- H(B_i xor B_{i-1}) */ + blkxor(&B[i * 16], &B[(i - 1) * 16], 16); + salsa20(&B[i * 16], ctx->salsa20_rounds); + } +#endif +} + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static uint32_t integerify(const uint32_t *B, size_t r) +{ +/* + * Our 32-bit words are in host byte order. Also, they are SIMD-shuffled, but + * we only care about the least significant 32 bits anyway. + */ + const uint32_t *X = &B[(2 * r - 1) * 16]; + return X[0]; +} + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint32_t p2floor(uint32_t x) +{ + uint32_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * wrap(x, i): + * Wrap x to the range 0 to i-1. + */ +static uint32_t wrap(uint32_t x, uint32_t i) +{ + uint32_t n = p2floor(i); + return (x & (n - 1)) + (i - n); +} + +/** + * smix1(B, r, N, V, X, ctx): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage X must be 128r bytes in length. + */ +static void smix1(uint32_t *B, size_t r, uint32_t N, + uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) +{ + size_t s = 32 * r; + uint32_t i, j; + size_t k; + + /* 1: X <-- B */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + X[k * 16 + i] = B[k * 16 + (i * 5 % 16)]; + + if (ctx->version != YESPOWER_0_5) { + for (k = 1; k < r; k++) { + blkcpy(&X[k * 32], &X[(k - 1) * 32], 32); + blockmix_pwxform(&X[k * 32], ctx, 1); + } + } + + /* 2: for i = 0 to N - 1 do */ + for (i = 0; i < N; i++) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if (i > 1) { + /* j <-- Wrap(Integerify(X), i) */ + j = wrap(integerify(X, r), i); + + /* X <-- X xor V_j */ + blkxor(X, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + if (V != ctx->S) + blockmix_pwxform(X, ctx, r); + else + blockmix_salsa(X, ctx->salsa20_rounds); + } + + /* B' <-- X */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + B[k * 16 + (i * 5 % 16)] = X[k * 16 + i]; +} + +/** + * smix2(B, r, N, Nloop, V, X, ctx): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage X must be 128r bytes in length. The value N must be a power of 2 + * greater than 1. 
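Once i > 1, smix1() above picks its read index with wrap(integerify(X), i), which p2floor()/wrap() confine to [i - p2floor(i), i - 1], so the index always points at a V block that has already been written. A standalone check (illustrative; the two helpers are copied verbatim from above):

#include <assert.h>
#include <stdint.h>

static uint32_t p2floor(uint32_t x)
{
    uint32_t y;
    while ((y = x & (x - 1)))
        x = y;
    return x;
}

static uint32_t wrap(uint32_t x, uint32_t i)
{
    uint32_t n = p2floor(i);
    return (x & (n - 1)) + (i - n);
}

int main(void)
{
    assert( p2floor(12) == 8 );
    assert( wrap(0, 12) == 4 );              /* low end:  i - p2floor(i) */
    assert( wrap(7, 12) == 11 );             /* high end: i - 1          */
    assert( wrap(0xffffffffu, 12) == 11 );
    return 0;
}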
+ */ +static void smix2(uint32_t *B, size_t r, uint32_t N, uint32_t Nloop, + uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) +{ + size_t s = 32 * r; + uint32_t i, j; + size_t k; + + /* X <-- B */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + X[k * 16 + i] = B[k * 16 + (i * 5 % 16)]; + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i++) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8.1: X <-- X xor V_j */ + blkxor(X, &V[j * s], s); + /* V_j <-- X */ + if (Nloop != 2) + blkcpy(&V[j * s], X, s); + + /* 8.2: X <-- H(X) */ + blockmix_pwxform(X, ctx, r); + } + + /* 10: B' <-- X */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + B[k * 16 + (i * 5 % 16)] = X[k * 16 + i]; +} + +/** + * smix(B, r, N, p, t, V, X, ctx): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * X must be 128r bytes in length. The value N must be a power of 2 and at + * least 16. + */ +static void smix(uint32_t *B, size_t r, uint32_t N, + uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) +{ + uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */ + uint32_t Nloop_rw = Nloop_all; + + Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */ + if (ctx->version == YESPOWER_0_5) { + Nloop_rw &= ~(uint32_t)1; /* round down to even */ + } else { + Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */ + } + + smix1(B, 1, ctx->Sbytes / 128, ctx->S, X, ctx); + smix1(B, r, N, V, X, ctx); + smix2(B, r, N, Nloop_rw /* must be > 2 */, V, X, ctx); + smix2(B, r, N, Nloop_all - Nloop_rw /* 0 or 2 */, V, X, ctx); +} + +/** + * yespower(local, src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * + * Return 0 on success; or -1 on error. 
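The Nloop arithmetic in smix() above pins the second-loop length at roughly N/3, split differently between the read-write and read-only passes for the 0.5 and 1.0 variants. Worked out for N = 2048 (the N used by the yescryptr8 registration further down); a throwaway check, not part of the file:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t N = 2048;
    uint32_t Nloop_all = (N + 2) / 3;                           /* 683: 1/3 of N, rounded up */
    Nloop_all++; Nloop_all &= ~(uint32_t)1;                     /* 684: rounded up to even   */
    uint32_t Nloop_rw_10 = ( (N + 2) / 3 + 1 ) & ~(uint32_t)1;  /* v1.0: 684                 */
    uint32_t Nloop_rw_05 = ( (N + 2) / 3 ) & ~(uint32_t)1;      /* v0.5: 682                 */
    printf( "v1.0: %u rw + %u ro, v0.5: %u rw + %u ro\n",
            Nloop_rw_10, Nloop_all - Nloop_rw_10,
            Nloop_rw_05, Nloop_all - Nloop_rw_05 );
    /* Either way smix2 adds ~N/3 blockmix calls on top of the N in smix1. */
    return 0;
}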
+ */ +int yespower_b2b_ref( yespower_local_t *local, const uint8_t *src, + size_t srclen, const yespower_params_t *params, + yespower_binary_t *dst, int thrid ) +{ + yespower_version_t version = params->version; + uint32_t N = params->N; + uint32_t r = params->r; + const uint8_t *pers = params->pers; + size_t perslen = params->perslen; + int retval = -1; + size_t B_size, V_size; + uint32_t *B, *V, *X, *S; + pwxform_ctx_t ctx; + uint8_t init_hash[32]; + sph_blake2b_ctx blake2b_ctx; + + /* Sanity-check parameters */ + if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) || + N < 1024 || N > 512 * 1024 || r < 8 || r > 32 || + (N & (N - 1)) != 0 || r < rmin || + (!pers && perslen)) { + errno = EINVAL; + return -1; + } + + /* Allocate memory */ + B_size = (size_t)128 * r; + V_size = B_size * N; + if ((V = malloc(V_size)) == NULL) + return -1; + if ((B = malloc(B_size)) == NULL) + goto free_V; + if ((X = malloc(B_size)) == NULL) + goto free_B; + ctx.version = version; + if (version == YESPOWER_0_5) { + ctx.salsa20_rounds = 8; + ctx.PWXrounds = PWXrounds_0_5; + ctx.Swidth = Swidth_0_5; + ctx.Sbytes = 2 * Swidth_to_Sbytes1(ctx.Swidth); + } else { + ctx.salsa20_rounds = 2; + ctx.PWXrounds = PWXrounds_1_0; + ctx.Swidth = Swidth_1_0; + ctx.Sbytes = 3 * Swidth_to_Sbytes1(ctx.Swidth); + } + if ((S = malloc(ctx.Sbytes)) == NULL) + goto free_X; + ctx.S = S; + ctx.S0 = (uint32_t (*)[2])S; + ctx.S1 = ctx.S0 + (1 << ctx.Swidth) * PWXsimple; + ctx.S2 = ctx.S1 + (1 << ctx.Swidth) * PWXsimple; + ctx.Smask = Swidth_to_Smask(ctx.Swidth); + ctx.w = 0; + + sph_blake2b_init( &blake2b_ctx, 32, NULL, 0 ); + sph_blake2b_update( &blake2b_ctx, src, srclen ); + sph_blake2b_final( &blake2b_ctx, init_hash ); +// SHA256_Buf(src, srclen, (uint8_t *)sha256); + + if (version != YESPOWER_0_5) { + if (pers) { + src = pers; + srclen = perslen; + } else { + srclen = 0; + } + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + pbkdf2_blake2b(init_hash, sizeof(init_hash), src, srclen, 1, + (uint8_t*)B, B_size ); + +// PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256), +// src, srclen, 1, (uint8_t *)B, B_size); + + memcpy(init_hash, B, sizeof(init_hash)); + +// blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + + /* 3: B_i <-- MF(B_i, N) */ + smix(B, r, N, V, X, &ctx); + +/* + if (version == YESPOWER_0_5) { + PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256), + (uint8_t *)B, B_size, 1, (uint8_t *)dst, sizeof(*dst)); + + if (pers) { + HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen, + (uint8_t *)sha256); + SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst); + } + } else { + + HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64, + sha256, sizeof(sha256), (uint8_t *)dst); + } +*/ + + hmac_blake2b_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash)); + + /* Success! 
*/ + retval = 1; + + /* Free memory */ + free(S); +free_X: + free(X); +free_B: + free(B); +free_V: + free(V); + + return retval; +} + +int yespower_b2b_tls_ref(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst, int thrid ) +{ +/* The reference implementation doesn't use thread-local storage */ + return yespower_b2b_ref(NULL, src, srclen, params, dst, thrid ); +} + +int yespower_b2b_init_local_ref(yespower_local_t *local) +{ +/* The reference implementation doesn't use the local structure */ + local->base = local->aligned = NULL; + local->base_size = local->aligned_size = 0; + return 0; +} + +int yespower_b2b_free_local_ref(yespower_local_t *local) +{ +/* The reference implementation frees its memory in yespower() */ + (void)local; /* unused */ + return 0; +} diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index 54d119e..35ad17b 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -35,12 +35,18 @@ __thread sha256_context sha256_prehash_ctx; // YESPOWER -int yespower_hash( const char *input, char *output, uint32_t len, int thrid ) +int yespower_hash( const char *input, char *output, int thrid ) { - return yespower_tls( input, len, &yespower_params, + return yespower_tls( input, 80, &yespower_params, (yespower_binary_t*)output, thrid ); } +int yespower_hash_ref( const char *input, char *output, int thrid ) +{ + return yespower_tls_ref( input, 80, &yespower_params, + (yespower_binary_t*)output, thrid ); +} + int scanhash_yespower( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -62,7 +68,7 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce, sha256_update( &sha256_prehash_ctx, endiandata, 64 ); do { - if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) ) + if ( algo_gate.hash( (char*)endiandata, (char*)vhash, thr_id ) ) if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark ) { be32enc( pdata+19, n ); @@ -77,9 +83,14 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce, // YESPOWER-B2B -int yespower_b2b_hash( const char *input, char *output, uint32_t len, int thrid ) +int yespower_b2b_hash( const char *input, char *output, int thrid ) { - return yespower_b2b_tls( input, len, &yespower_params, (yespower_binary_t*)output, thrid ); + return yespower_b2b_tls( input, 80, &yespower_params, (yespower_binary_t*)output, thrid ); +} + +int yespower_b2b_hash_ref( const char *input, char *output, int thrid ) +{ + return yespower_b2b_tls_ref( input, 80, &yespower_params, (yespower_binary_t*)output, thrid ); } int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce, @@ -99,7 +110,7 @@ int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce, endiandata[19] = n; do { - if (yespower_b2b_hash( (char*) endiandata, (char*) vhash, 80, thr_id ) ) + if ( algo_gate.hash( (char*) endiandata, (char*) vhash, thr_id ) ) if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark ) { be32enc( pdata+19, n ); @@ -140,7 +151,11 @@ bool register_yespower_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_hash_ref; +#else gate->hash = (void*)&yespower_hash; +#endif opt_target_factor = 65536.0; return true; }; @@ -165,6 +180,11 @@ bool register_yescrypt_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; +#if defined(__aarch64__) + gate->hash = 
(void*)&yespower_hash_ref; +#else + gate->hash = (void*)&yespower_hash; +#endif yespower_params.version = YESPOWER_0_5; opt_target_factor = 65536.0; @@ -197,7 +217,12 @@ bool register_yescrypt_algo( algo_gate_t* gate ) bool register_yescryptr8_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yespower; + gate->scanhash = (void*)&scanhash_yespower; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_hash_ref; +#else + gate->hash = (void*)&yespower_hash; +#endif yespower_params.version = YESPOWER_0_5; yespower_params.N = 2048; yespower_params.r = 8; @@ -211,6 +236,11 @@ bool register_yescryptr16_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_hash_ref; +#else + gate->hash = (void*)&yespower_hash; +#endif yespower_params.version = YESPOWER_0_5; yespower_params.N = 4096; yespower_params.r = 16; @@ -224,6 +254,11 @@ bool register_yescryptr32_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_hash_ref; +#else + gate->hash = (void*)&yespower_hash; +#endif yespower_params.version = YESPOWER_0_5; yespower_params.N = 4096; yespower_params.r = 32; @@ -251,7 +286,11 @@ bool register_power2b_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | AVX2_OPT; gate->scanhash = (void*)&scanhash_yespower_b2b; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_b2b_hash_ref; +#else gate->hash = (void*)&yespower_b2b_hash; +#endif opt_target_factor = 65536.0; return true; }; @@ -291,7 +330,11 @@ bool register_yespower_b2b_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT; gate->scanhash = (void*)&scanhash_yespower_b2b; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_b2b_hash_ref; +#else gate->hash = (void*)&yespower_b2b_hash; +#endif opt_target_factor = 65536.0; return true; }; diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index 56df20d..04aa9ac 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -38,6 +38,8 @@ * preparation for a hard-fork). */ +#if !defined(__aarch64__) + #ifndef _YESPOWER_OPT_C_PASS_ #define _YESPOWER_OPT_C_PASS_ 1 #endif @@ -1358,3 +1360,5 @@ int yespower_free_local(yespower_local_t *local) return free_region(local); } #endif + +#endif // !aarch64 diff --git a/algo/yespower/yespower-ref.c b/algo/yespower/yespower-ref.c index e9a498a..c390b38 100644 --- a/algo/yespower/yespower-ref.c +++ b/algo/yespower/yespower-ref.c @@ -453,7 +453,7 @@ static void smix(uint32_t *B, size_t r, uint32_t N, * * Return 0 on success; or -1 on error. 
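A minimal usage sketch of the reference entry points wired up here; the N/r values are placeholders and the uc[] member of yespower_binary_t is assumed from yespower.h. As the retval = 1 assignment above shows, these cpuminer variants return 1 when a hash was produced (which is what the truthy check on algo_gate.hash() in the scanners relies on) and -1 on bad parameters or allocation failure. One detail worth noting in the blake2b reference: the final hmac_blake2b_hash() call computes B + B_size - 64 on a uint32_t pointer, i.e. in 32-bit words, whereas the commented-out HMAC_SHA256_Buf() call it replaces casts B to uint8_t * before applying the byte offset.

#include <stdint.h>
#include <string.h>
#include "yespower.h"

/* Illustrative only: hash one 80-byte block header through the blake2b
 * reference path. Parameter values here are made up for the example. */
static int example_b2b_ref_hash( const uint8_t header[80], uint8_t out[32],
                                 int thr_id )
{
    yespower_params_t params = {
        .version = YESPOWER_1_0,
        .N       = 2048,
        .r       = 32,
        .pers    = NULL,
        .perslen = 0
    };
    yespower_binary_t dst;

    if ( yespower_b2b_tls_ref( header, 80, &params, &dst, thr_id ) != 1 )
        return 0;                       /* EINVAL or malloc failure */
    memcpy( out, dst.uc, sizeof dst.uc );
    return 1;
}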
*/ -int yespower( yespower_local_t *local, const uint8_t *src, size_t srclen, +int yespower_ref( yespower_local_t *local, const uint8_t *src, size_t srclen, const yespower_params_t *params, yespower_binary_t *dst, int thrid ) { yespower_version_t version = params->version; @@ -556,14 +556,14 @@ free_V: return retval; } -int yespower_tls(const uint8_t *src, size_t srclen, +int yespower_tls_ref(const uint8_t *src, size_t srclen, const yespower_params_t *params, yespower_binary_t *dst, int thrid ) { /* The reference implementation doesn't use thread-local storage */ - return yespower(NULL, src, srclen, params, dst, thrid ); + return yespower_ref(NULL, src, srclen, params, dst, thrid ); } -int yespower_init_local(yespower_local_t *local) +int yespower_init_local_ref(yespower_local_t *local) { /* The reference implementation doesn't use the local structure */ local->base = local->aligned = NULL; @@ -571,7 +571,7 @@ int yespower_init_local(yespower_local_t *local) return 0; } -int yespower_free_local(yespower_local_t *local) +int yespower_free_local_ref(yespower_local_t *local) { /* The reference implementation frees its memory in yespower() */ (void)local; /* unused */ diff --git a/algo/yespower/yespower.h b/algo/yespower/yespower.h index aa19004..c93663d 100644 --- a/algo/yespower/yespower.h +++ b/algo/yespower/yespower.h @@ -155,6 +155,21 @@ extern int yespower_8way_tls( const __m256i *src, size_t srclen, #endif // AVX2 +extern int yespower_ref(yespower_local_t *local, + const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst, int thrid); + +extern int yespower_b2b_ref(yespower_local_t *local, + const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst, int thrid ); + +extern int yespower_tls_ref(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst, int thr_id); + +extern int yespower_b2b_tls_ref(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst, int thr_id); + + #ifdef __cplusplus } #endif diff --git a/build-allarch.sh b/build-allarch.sh index 5d4bddf..e20aa98 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,7 +4,7 @@ # during develpment. However the information contained may provide compilation # tips to users. -rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null # AVX512 SHA VAES: Intel Core Icelake, Rocketlake make distclean || echo clean @@ -13,7 +13,7 @@ rm -f config.status CFLAGS="-O3 -march=icelake-client -Wall -fno-common" ./configure --with-curl # Rocketlake needs gcc-11 #CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx512-sha-vaes @@ -34,7 +34,7 @@ rm -f config.status # Inclomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer. 
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall -fno-common " ./configure --with-curl #CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -Wall -fno-common " ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-zen4 @@ -43,7 +43,7 @@ make clean || echo clean rm -f config.status #CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-zen3 @@ -51,7 +51,7 @@ mv cpuminer cpuminer-zen3 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=skylake-avx512 -maes -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx512 @@ -60,7 +60,7 @@ make clean || echo done rm -f config.status # vaes doesn't include aes CFLAGS="-O3 -maes -mavx2 -msha -mvaes -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx2-sha-vaes @@ -69,7 +69,7 @@ make clean || echo done rm -f config.status #CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx2-sha @@ -78,7 +78,7 @@ make clean || echo clean rm -f config.status # GCC 9 doesn't include AES with core-avx2 CFLAGS="-O3 -march=core-avx2 -maes -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx2 @@ -86,7 +86,7 @@ mv cpuminer cpuminer-avx2 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7-avx -maes -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx @@ -94,7 +94,7 @@ mv cpuminer cpuminer-avx make clean || echo clean rm -f config.status CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-aes-sse42 @@ -102,7 +102,7 @@ mv cpuminer cpuminer-aes-sse42 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7 -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-sse42 @@ -110,7 +110,7 @@ mv cpuminer cpuminer-sse42 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=core2 -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-ssse3 @@ -118,14 +118,22 @@ mv cpuminer cpuminer-ssse3 make clean || echo clean rm -f config.status CFLAGS="-O3 -msse2 -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-sse2 +# X86_64 +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=x86-64 -Wall -fno-common" ./configure --with-curl +make -j $nproc +strip -s cpuminer +mv cpuminer cpuminer-x64 + # Native to host CPU make clean || echo done rm -f config.status CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer diff --git a/build-armv8.sh b/build-armv8.sh new file mode 100755 index 0000000..0d68f1f --- /dev/null +++ b/build-armv8.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Linux build + +make distclean || echo clean + +rm -f config.status +./autogen.sh || echo done + +CFLAGS="-O2 -march=armv8-a+crypto+sha2+aes -Wall 
-flax-vector-conversions" ./configure --with-curl --host=aarch64-cortexa76-elf --build=x86_64-pc-linux-gnu --target=aarch64-cortexa76-elf +#CFLAGS="-O2 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl + +make -j $nproc + +strip -s cpuminer diff --git a/build-avx2.sh b/build-avx2.sh index 7a12473..aeca888 100755 --- a/build-avx2.sh +++ b/build-avx2.sh @@ -22,6 +22,6 @@ rm -f config.status CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl -make -j 4 +make -j $nproc strip -s cpuminer diff --git a/build.sh b/build.sh index 39bf5f6..c6f895c 100755 --- a/build.sh +++ b/build.sh @@ -15,6 +15,6 @@ rm -f config.status #CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr CFLAGS="-O3 -march=native -Wall" ./configure --with-curl -make -j 4 +make -j $nproc strip -s cpuminer diff --git a/clean-all.sh b/clean-all.sh index 902a7ef..855b54f 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -2,8 +2,8 @@ # # make clean and rm all the targetted executables. -rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 > /dev/null -rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe > /dev/null +rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null make distclean > /dev/null diff --git a/compat/sha3-defs.h b/compat/sha3-defs.h index 1060737..1b3bb69 100644 --- a/compat/sha3-defs.h +++ b/compat/sha3-defs.h @@ -1,7 +1,6 @@ #ifndef DEFS_X5_H__ #define DEFS_X5_H__ -#include typedef unsigned char BitSequence; typedef unsigned long long DataLength; typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn; diff --git a/configure b/configure index c863e08..a8303df 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.3. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.4. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.23.3' -PACKAGE_STRING='cpuminer-opt 3.23.3' +PACKAGE_VERSION='3.23.4' +PACKAGE_STRING='cpuminer-opt 3.23.4' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.23.3 to adapt to many kinds of systems. 
+\`configure' configures cpuminer-opt 3.23.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.23.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.23.4:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.23.3 +cpuminer-opt configure 3.23.4 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.23.3, which was +It was created by cpuminer-opt $as_me 3.23.4, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.23.3' + VERSION='3.23.4' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.23.3, which was +This file was extended by cpuminer-opt $as_me 3.23.4, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 3.23.3 +cpuminer-opt config.status 3.23.4 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index f2fd87a..a7cd526 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.23.3]) +AC_INIT([cpuminer-opt], [3.23.4]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index 59e43a1..15b201e 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.3. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.4. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.23.3' -PACKAGE_STRING='cpuminer-opt 3.23.3' +PACKAGE_VERSION='3.23.4' +PACKAGE_STRING='cpuminer-opt 3.23.4' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.23.3 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.23.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.23.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.23.4:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.23.3 +cpuminer-opt configure 3.23.4 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.23.3, which was +It was created by cpuminer-opt $as_me 3.23.4, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.23.3' + VERSION='3.23.4' cat >>confdefs.h <<_ACEOF @@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.23.3, which was +This file was extended by cpuminer-opt $as_me 3.23.4, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6784,7 +6784,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.23.3 +cpuminer-opt config.status 3.23.4 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index 06c1fe5..2746ca8 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -880,8 +880,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) } // reverse the bytes in target - casti_m128i( work->target, 0 ) = mm128_bswap_128( casti_m128i( target, 1 ) ); - casti_m128i( work->target, 1 ) = mm128_bswap_128( casti_m128i( target, 0 ) ); + casti_v128( work->target, 0 ) = v128_bswap128( casti_v128( target, 1 ) ); + casti_v128( work->target, 1 ) = v128_bswap128( casti_v128( target, 0 ) ); net_diff = work->targetdiff = hash_to_diff( work->target ); tmp = json_object_get( val, "workid" ); @@ -987,6 +987,17 @@ void report_summary_log( bool force ) { struct timeval now, et, uptime, start_time; + if ( rejected_share_count ) + { + if ( rejected_share_count > ( submitted_share_count * .5 ) ) + { + applog(LOG_ERR,"Excessive rejected share rate, exiting..."); + exit(1); + } + else if ( rejected_share_count > ( submitted_share_count * .1 ) ) + applog(LOG_WARNING,"High rejected share rate, check settings."); + } + gettimeofday( &now, NULL ); timeval_subtract( &et, &now, &five_min_start ); @@ -2827,28 +2838,44 @@ static void show_credits() #define check_cpu_capability() cpu_capability( false ) #define display_cpu_capability() cpu_capability( true ) + +#if defined(__aarch64__) + +#define XSTR(x) STR(x) +#define STR(x) #x + +#pragma message "Building for armv" XSTR(__ARM_ARCH) + +#endif + static bool cpu_capability( bool display_only ) { char cpu_brand[0x40]; - bool cpu_has_sse2 = has_sse2(); - bool cpu_has_sse42 = has_sse42(); - bool cpu_has_avx = has_avx(); - bool cpu_has_avx2 = has_avx2(); - bool cpu_has_avx512 = has_avx512(); - bool cpu_has_avx10 = has_avx10(); - bool cpu_has_aes = has_aes_ni(); - bool cpu_has_vaes = has_vaes(); - bool cpu_has_sha = has_sha(); - 
bool cpu_has_sha512 = has_sha512();
- bool sw_has_sse2 = false;
- bool sw_has_sse42 = false;
- bool sw_has_avx = false;
- bool sw_has_avx2 = false;
- bool sw_has_avx512 = false;
- bool sw_has_aes = false;
- bool sw_has_vaes = false;
- bool sw_has_sha = false;
- bool sw_has_sha512 = false;
+ bool cpu_has_aarch64 = cpu_arch_aarch64();
+ bool cpu_has_x86_64 = cpu_arch_x86_64();
+ bool cpu_has_sse2 = has_sse2(); // X86_64 only
+ bool cpu_has_sse42 = has_sse42();
+ bool cpu_has_avx = has_avx();
+ bool cpu_has_avx2 = has_avx2();
+ bool cpu_has_avx512 = has_avx512();
+ bool cpu_has_avx10 = has_avx10();
+ bool cpu_has_aes = has_aes_ni(); // x86_64 or AArch64 AES
+ bool cpu_has_vaes = has_vaes();
+ bool cpu_has_sha = has_sha(); // x86_64 or AArch64
+ bool cpu_has_sha512 = has_sha512();
+ bool sw_has_x86_64 = false;
+ bool sw_has_aarch64 = false;
+ int sw_arm_arch = 0;
+ bool sw_has_neon = false;
+ bool sw_has_sse2 = false; // x86_64 or ARM NEON
+ bool sw_has_sse42 = false;
+ bool sw_has_avx = false;
+ bool sw_has_avx2 = false;
+ bool sw_has_avx512 = false;
+ bool sw_has_aes = false;
+ bool sw_has_vaes = false;
+ bool sw_has_sha = false; // x86_64 or AArch64 SHA2
+ bool sw_has_sha512 = false; // x86_64 or AArch64 SHA3
 set_t algo_features = algo_gate.optimizations;
 bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
 bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
@@ -2868,9 +2895,22 @@ static bool cpu_capability( bool display_only )
 bool use_vaes;
 bool use_sha;
 bool use_sha512;
+ bool use_neon;
 bool use_none;

- #ifdef __SSE2__
+ // x86_64 or AArch64 build target
+ #if defined(__x86_64__)
+ sw_has_x86_64 = true;
+ #elif defined(__aarch64__)
+ sw_has_aarch64 = true;
+ #ifdef __ARM_NEON
+ sw_has_neon = true;
+ #endif
+ #ifdef __ARM_ARCH
+ sw_arm_arch = __ARM_ARCH;
+ #endif
+ #endif
+ #if defined(__SSE2__) || defined(__ARM_NEON)
 sw_has_sse2 = true;
 #endif
 #ifdef __SSE4_2__
@@ -2885,16 +2925,16 @@ static bool cpu_capability( bool display_only )
 #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
 sw_has_avx512 = true;
 #endif
- #ifdef __AES__
+ #if defined(__AES__) || defined(__ARM_FEATURE_AES)
 sw_has_aes = true;
 #endif
 #ifdef __VAES__
 sw_has_vaes = true;
 #endif
- #ifdef __SHA__
+ #if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
 sw_has_sha = true;
 #endif
- #ifdef __SHA512__
+ #if defined(__SHA512__) || defined(__ARM_FEATURE_SHA3)
 sw_has_sha512 = true;
 #endif

@@ -2912,28 +2952,43 @@ static bool cpu_capability( bool display_only )
 #endif

 printf("CPU features: ");
- if ( cpu_has_avx512 ) printf( " AVX512" );
- else if ( cpu_has_avx2 ) printf( " AVX2 " );
- else if ( cpu_has_avx ) printf( " AVX " );
- else if ( cpu_has_sse42 ) printf( " SSE4.2" );
- else if ( cpu_has_sse2 ) printf( " SSE2 " );
- if ( cpu_has_vaes ) printf( " VAES" );
- else if ( cpu_has_aes ) printf( " AES" );
- if ( cpu_has_sha512 ) printf( " SHA512" );
- else if ( cpu_has_sha ) printf( " SHA" );
- if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
- avx10_version(), avx10_vector_length() );
+ if ( cpu_has_x86_64 )
+ {
+ printf( " x86_64" );
+ if ( cpu_has_avx512 ) printf( " AVX512" );
+ else if ( cpu_has_avx2 ) printf( " AVX2 " );
+ else if ( cpu_has_avx ) printf( " AVX " );
+ else if ( cpu_has_sse42 ) printf( " SSE4.2" );
+ else if ( cpu_has_sse2 ) printf( " SSE2 " );
+ }
+ else if ( cpu_has_aarch64 ) printf( " AArch64 NEON" ); // NEON assumed
+ if ( cpu_has_vaes ) printf( " VAES" );
+ else if ( cpu_has_aes ) printf( " AES" );
+ if ( cpu_has_sha512 ) printf( " SHA512" );
+ else if ( cpu_has_sha )
printf( " SHA256" ); + if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", + avx10_version(), avx10_vector_length() ); printf("\nSW features: "); - if ( sw_has_avx512 ) printf( " AVX512" ); - else if ( sw_has_avx2 ) printf( " AVX2 " ); - else if ( sw_has_avx ) printf( " AVX " ); - else if ( sw_has_sse42 ) printf( " SSE4.2" ); - else if ( sw_has_sse2 ) printf( " SSE2 " ); - if ( sw_has_vaes ) printf( " VAES" ); - else if ( sw_has_aes ) printf( " AES" ); - if ( sw_has_sha512 ) printf( " SHA512" ); - else if ( sw_has_sha ) printf( " SHA" ); + if ( sw_has_x86_64 ) + { + printf( " x86_64" ); + if ( sw_has_avx512 ) printf( " AVX512" ); + else if ( sw_has_avx2 ) printf( " AVX2 " ); + else if ( sw_has_avx ) printf( " AVX " ); + else if ( sw_has_sse42 ) printf( " SSE4.2" ); + else if ( sw_has_sse2 ) printf( " SSE2 " ); + } + else if ( sw_has_aarch64 ) + { + printf( " AArch64" ); + if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch ); + if ( sw_has_neon ) printf( " NEON" ); + } + if ( sw_has_vaes ) printf( " VAES" ); + else if ( sw_has_aes ) printf( " AES" ); + if ( sw_has_sha512 ) printf( " SHA512" ); + else if ( sw_has_sha ) printf( " SHA256" ); if ( !display_only ) { @@ -2948,7 +3003,7 @@ static bool cpu_capability( bool display_only ) if ( algo_has_vaes ) printf( " VAES" ); else if ( algo_has_aes ) printf( " AES" ); if ( algo_has_sha512 ) printf( " SHA512" ); - else if ( algo_has_sha ) printf( " SHA" ); + else if ( algo_has_sha ) printf( " SHA256" ); } } printf("\n"); @@ -2992,14 +3047,18 @@ static bool cpu_capability( bool display_only ) use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; use_sha512 = cpu_has_sha512 && sw_has_sha512 && algo_has_sha512; + use_neon = sw_has_aarch64 && sw_has_neon; use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512 - || use_avx2 || use_sha || use_vaes || use_sha512 ); + || use_avx2 || use_sha || use_vaes || use_sha512 || use_neon ); // Display best options printf( "\nStarting miner with" ); - if ( use_none ) printf( " no optimizations" ); + if ( use_none ) printf( " no optimizations" ); else { + if ( cpu_has_aarch64 ) printf( " AArch64"); + else + printf( " x86_64" ); if ( use_avx512 ) printf( " AVX512" ); else if ( use_avx2 ) printf( " AVX2" ); else if ( use_avx ) printf( " AVX" ); @@ -3008,13 +3067,16 @@ static bool cpu_capability( bool display_only ) if ( use_vaes ) printf( " VAES" ); else if ( use_aes ) printf( " AES" ); if ( use_sha512 ) printf( " SHA512" ); - else if ( use_sha ) printf( " SHA" ); + else if ( use_sha ) printf( " SHA256" ); + if ( use_neon ) printf( " NEON" ); } printf( "...\n\n" ); return true; } - + + + void show_version_and_exit(void) { printf("\n built on " __DATE__ diff --git a/miner.h b/miner.h index 6ff5d4a..3845a24 100644 --- a/miner.h +++ b/miner.h @@ -24,10 +24,6 @@ #endif /* _MSC_VER */ -// prevent questions from ARM users that don't read the requirements. -#if !defined(__x86_64__) -#error "CPU architecture not supported. Consult the requirements for supported CPUs." 
-#endif #include #include @@ -126,11 +122,14 @@ static inline bool is_windows(void) #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif +// deprecated, see simd-int.h #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) #define WANT_BUILTIN_BSWAP +/* #else #define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +*/ #endif static inline uint32_t swab32(uint32_t v) @@ -138,7 +137,11 @@ static inline uint32_t swab32(uint32_t v) #ifdef WANT_BUILTIN_BSWAP return __builtin_bswap32(v); #else - return bswap_32(v); + return ( (x << 24) & 0xff000000u ) | ( (x << 8) & 0x00ff0000u ) + | ( (x >> 8) & 0x0000ff00u ) | ( (x >> 24) & 0x000000ffu ) + + +// return bswap_32(v); #endif } @@ -180,8 +183,6 @@ static inline void be32enc(void *pp, uint32_t x) } #endif -// Deprecated in favour of mm64_bswap_32 -// // This is a poorman's SIMD instruction, use 64 bit instruction to encode 2 // uint32_t. This function flips endian on two adjacent 32 bit quantities // aligned to 64 bits. If source is LE output is BE, and vice versa. @@ -195,11 +196,8 @@ static inline void swab32_x2( uint64_t* dst, uint64_t src ) static inline void swab32_array( uint32_t* dst_p, uint32_t* src_p, int n ) { - // Assumes source is LE - for ( int i=0; i < n/2; i++ ) + for ( int i = 0; i < n/2; i++ ) swab32_x2( &((uint64_t*)dst_p)[i], ((uint64_t*)src_p)[i] ); -// if ( n % 2 ) -// be32enc( &dst_p[ n-1 ], src_p[ n-1 ] ); } #if !HAVE_DECL_LE32ENC diff --git a/simd-utils.h b/simd-utils.h index 196fbe9..bae056b 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -118,44 +118,41 @@ ////////////////////////////////////////////////////////////////////////// #include -#include #include #include #include +#include + +#if defined(__x86_64__) + +#include + +#elif defined(__aarch64__) + +#include + +#endif -// 64 and 128 bit integers. #include "simd-utils/simd-int.h" -#if defined(__MMX__) - -// 64 bit vectors +// x86_64 MMX 64 bit vectors #include "simd-utils/simd-64.h" -#if defined(__SSE2__) - -// 128 bit vectors +// x86_64 SSE2 128 bit vectors #include "simd-utils/simd-128.h" -#if defined(__AVX__) - -// 256 bit vector basics +// x86_64 AVX2 256 bit vectors #include "simd-utils/simd-256.h" -#if defined(__AVX2__) - -// Utilities that require AVX2 are defined in simd-256.h. 
- -// Skylake-X has all these -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -// 512 bit vectors +// x86_64 AVX512 512 bit vectors #include "simd-utils/simd-512.h" -#endif // AVX512 -#endif // AVX2 -#endif // AVX -#endif // SSE2 -#endif // MMX +// move up after cleaning +// CPU architectire abstraction +//#include "simd-utils/simd-portable.h" + +// aarch64 neon 128 bit vectors +#include "simd-utils/simd-neon.h" #include "simd-utils/intrlv.h" diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index a12d66e..2f04da3 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -86,39 +86,38 @@ static inline void extr_lane_2x32( void *dst, const void *src, // 4x32 -#if defined(__SSE4_1__) +#if ( defined(__x86_64__) && defined(__SSE4_1__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) ) #define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \ - D0 = mm128_mov32_32( S0, 1, S1, 0 ); \ - D1 = mm128_mov32_32( S1, 0, S0, 1 ); \ - D2 = mm128_mov32_32( S2, 0, S0, 2 ); \ - D3 = mm128_mov32_32( S3, 0, S0, 3 ); \ - D0 = mm128_mov32_32( D0, 2, S2, 0 ); \ - D1 = mm128_mov32_32( D1, 2, S2, 1 ); \ - D2 = mm128_mov32_32( D2, 1, S1, 2 ); \ - D3 = mm128_mov32_32( D3, 1, S1, 3 ); \ - D0 = mm128_mov32_32( D0, 3, S3, 0 ); \ - D1 = mm128_mov32_32( D1, 3, S3, 1 ); \ - D2 = mm128_mov32_32( D2, 3, S3, 2 ); \ - D3 = mm128_mov32_32( D3, 2, S2, 3 ); + D0 = v128_mov32( S0, 1, S1, 0 ); \ + D1 = v128_mov32( S1, 0, S0, 1 ); \ + D2 = v128_mov32( S2, 0, S0, 2 ); \ + D3 = v128_mov32( S3, 0, S0, 3 ); \ + D0 = v128_mov32( D0, 2, S2, 0 ); \ + D1 = v128_mov32( D1, 2, S2, 1 ); \ + D2 = v128_mov32( D2, 1, S1, 2 ); \ + D3 = v128_mov32( D3, 1, S1, 3 ); \ + D0 = v128_mov32( D0, 3, S3, 0 ); \ + D1 = v128_mov32( D1, 3, S3, 1 ); \ + D2 = v128_mov32( D2, 3, S3, 2 ); \ + D3 = v128_mov32( D3, 2, S2, 3 ); #define LOAD_SRCE( S0, S1, S2, S3, src0, i0, src1, i1, src2, i2, src3, i3 ) \ - S0 = _mm_load_si128( (const __m128i*)(src0) + (i0) ); \ - S1 = _mm_load_si128( (const __m128i*)(src1) + (i1) ); \ - S2 = _mm_load_si128( (const __m128i*)(src2) + (i2) ); \ - S3 = _mm_load_si128( (const __m128i*)(src3) + (i3) ); + S0 = v128_load( (const v128_t*)(src0) + (i0) ); \ + S1 = v128_load( (const v128_t*)(src1) + (i1) ); \ + S2 = v128_load( (const v128_t*)(src2) + (i2) ); \ + S3 = v128_load( (const v128_t*)(src3) + (i3) ); #define STORE_DEST( D0, D1, D2, D3, dst0, i0, dst1, i1, dst2, i2, dst3, i3 ) \ - _mm_store_si128( (__m128i*)(dst0) + (i0), D0 ); \ - _mm_store_si128( (__m128i*)(dst1) + (i1), D1 ); \ - _mm_store_si128( (__m128i*)(dst2) + (i2), D2 ); \ - _mm_store_si128( (__m128i*)(dst3) + (i3), D3 ); - + v128_store( (v128_t*)(dst0) + (i0), D0 ); \ + v128_store( (v128_t*)(dst1) + (i1), D1 ); \ + v128_store( (v128_t*)(dst2) + (i2), D2 ); \ + v128_store( (v128_t*)(dst3) + (i3), D3 ); static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, const int bit_len ) { - __m128i D0, D1, D2, D3, S0, S1, S2, S3; + v128_t D0, D1, D2, D3, S0, S1, S2, S3; LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); @@ -160,7 +159,7 @@ static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, static inline void intrlv_4x32_512( void *dst, const void *src0, const void *src1, const void *src2, const void *src3 ) { - __m128i D0, D1, D2, D3, S0, S1, S2, S3; + v128_t D0, D1, D2, D3, S0, S1, S2, S3; LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); ILEAVE_4x32( D0, D1, D2, 
D3, S0, S1, S2, S3 ); @@ -179,7 +178,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0, static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, const int bit_len ) { - __m128i D0, D1, D2, D3, S0, S1, S2, S3; + v128_t D0, D1, D2, D3, S0, S1, S2, S3; LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); @@ -221,7 +220,7 @@ static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { - __m128i D0, D1, D2, D3, S0, S1, S2, S3; + v128_t D0, D1, D2, D3, S0, S1, S2, S3; LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); @@ -382,7 +381,7 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, d0[15] = s[ 60]; d1[15] = s[ 61]; d2[15] = s[ 62]; d3[15] = s[ 63]; } -#endif // SSE4_1 else SSE2 +#endif // SSE4_1 else SSE2 or NEON static inline void extr_lane_4x32( void *d, const void *s, const int lane, const int bit_len ) @@ -408,7 +407,7 @@ static inline void extr_lane_4x32( void *d, const void *s, #if defined(__SSSE3__) -static inline void mm128_bswap32_80( void *d, void *s ) +static inline void v128_bswap32_80( void *d, void *s ) { const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); @@ -419,9 +418,20 @@ static inline void mm128_bswap32_80( void *d, void *s ) casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf ); } +#elif defined(__aarch64__) && defined(__ARM_NEON) + +static inline void v128_bswap32_80( void *d, void *s ) +{ + casti_v128( d, 0 ) = v128_bswap32( casti_v128( s, 0 ) ); + casti_v128( d, 1 ) = v128_bswap32( casti_v128( s, 1 ) ); + casti_v128( d, 2 ) = v128_bswap32( casti_v128( s, 2 ) ); + casti_v128( d, 3 ) = v128_bswap32( casti_v128( s, 3 ) ); + casti_v128( d, 4 ) = v128_bswap32( casti_v128( s, 4 ) ); +} + #else -static inline void mm128_bswap32_80( void *d, void *s ) +static inline void v128_bswap32_80( void *d, void *s ) { ( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] ); ( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] ); @@ -447,7 +457,9 @@ static inline void mm128_bswap32_80( void *d, void *s ) #endif -static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) +#if defined(__SSE2__) + +static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) { __m128i s0 = casti_m128i( src,0 ); __m128i s1 = casti_m128i( src,1 ); @@ -502,6 +514,49 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff ); } +#elif defined(__aarch64__) && defined(__ARM_NEON) + +static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) +{ + v128_t s0 = casti_v128( src,0 ); + v128_t s1 = casti_v128( src,1 ); + v128_t s2 = casti_v128( src,2 ); + v128_t s3 = casti_v128( src,3 ); + v128_t s4 = casti_v128( src,4 ); + + s0 = v128_bswap32( s0 ); + s1 = v128_bswap32( s1 ); + s2 = v128_bswap32( s2 ); + s3 = v128_bswap32( s3 ); + s4 = v128_bswap32( s4 ); + + casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 ); + casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 ); + casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 ); + casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 ); + + casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 ); + casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 ); + casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 ); + casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 ); + + 
casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
+ casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
+ casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
+ casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );
+
+ casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
+ casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
+ casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
+ casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );
+
+ casti_v128( d,16 ) = vdupq_laneq_u32( s4, 0 );
+ casti_v128( d,17 ) = vdupq_laneq_u32( s4, 1 );
+ casti_v128( d,18 ) = vdupq_laneq_u32( s4, 2 );
+ casti_v128( d,19 ) = vdupq_laneq_u32( s4, 3 );
+}
+
+#endif

 // 8x32

@@ -1365,8 +1420,51 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
 //
 // 64 bit data

+// 2x64 (SSE2)
+
+static inline void intrlv_2x64( void *dst, const void *src0,
+ const void *src1, const int bit_len )
+{
+ uint64_t *d = (uint64_t*)dst;
+ const uint64_t *s0 = (const uint64_t*)src0;
+ const uint64_t *s1 = (const uint64_t*)src1;
+ d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s0[ 1]; d[ 3] = s1[ 1];
+ d[ 4] = s0[ 2]; d[ 5] = s1[ 2]; d[ 6] = s0[ 3]; d[ 7] = s1[ 3];
+ if ( bit_len <= 256 ) return;
+ d[ 8] = s0[ 4]; d[ 9] = s1[ 4]; d[10] = s0[ 5]; d[11] = s1[ 5];
+ d[12] = s0[ 6]; d[13] = s1[ 6]; d[14] = s0[ 7]; d[15] = s1[ 7];
+ if ( bit_len <= 512 ) return;
+ d[16] = s0[ 8]; d[17] = s1[ 8]; d[18] = s0[ 9]; d[19] = s1[ 9];
+ if ( bit_len <= 640 ) return;
+ d[20] = s0[10]; d[21] = s1[10]; d[22] = s0[11]; d[23] = s1[11];
+ d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13];
+ d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15];
+}
+
+static inline void dintrlv_2x64( void *dst0, void *dst1,
+ const void *src, const int bit_len )
+{
+ uint64_t *d0 = (uint64_t*)dst0;
+ uint64_t *d1 = (uint64_t*)dst1;
+ const uint64_t *s = (const uint64_t*)src;
+
+ d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d0[ 1] = s[ 2]; d1[ 1] = s[ 3];
+ d0[ 2] = s[ 4]; d1[ 2] = s[ 5]; d0[ 3] = s[ 6]; d1[ 3] = s[ 7];
+ if ( bit_len <= 256 ) return;
+ d0[ 4] = s[ 8]; d1[ 4] = s[ 9]; d0[ 5] = s[10]; d1[ 5] = s[11];
+ d0[ 6] = s[12]; d1[ 6] = s[13]; d0[ 7] = s[14]; d1[ 7] = s[15];
+ if ( bit_len <= 512 ) return;
+ d0[ 8] = s[16]; d1[ 8] = s[17]; d0[ 9] = s[18]; d1[ 9] = s[19];
+ if ( bit_len <= 640 ) return;
+ d0[10] = s[20]; d1[10] = s[21]; d0[11] = s[22]; d1[11] = s[23];
+ d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27];
+ d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31];
+}
+
 // 4x64 (AVX2)

+#if defined(__SSE2__)
+
 static inline void intrlv_4x64( void *dst, const void *src0, const void *src1,
 const void *src2, const void *src3, const int bit_len )
@@ -1560,6 +1658,8 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
 _mm256_castsi128_si256( s4 ), 0x55 );
 }

+#endif
+
 #if defined(__AVX512VL__) && defined(__AVX512VBMI__)
 //TODO Enable for AVX10_256 AVX10_512
@@ -1596,7 +1696,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 _mm256_castsi128_si256( s4 ) );
 }

-#else
+#elif defined(__AVX2__)

 static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 {
@@ -1626,12 +1726,14 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 _mm256_castsi128_si256( s4 ), 0x55 );
 }

-#endif
+#endif // AVX2

-#endif // AVX2
+#endif // SSE2

 // 8x64 (AVX512)

+#if defined(__SSE2__)
+
 static inline void intrlv_8x64( void *dst, const void *src0, const void *src1,
 const void *src2, const void *src3, const void *src4, const void *src5, const void *src6,
@@ -1948,6 +2050,8 @@ static inline void
extr_lane_8x64( void *dst, const void *src, const int lane, return; } +#endif // SSE2 + #if defined(__AVX512F__) && defined(__AVX512VL__) //TODO Enable for AVX10_512 @@ -2052,6 +2156,8 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) // 2x128 (AVX2) +#if defined(__SSE2__) + static inline void intrlv_2x128( void *dst, const void *src0, const void *src1, const int bit_len ) { @@ -2195,6 +2301,8 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2, d0[3] = s[12]; d1[3] = s[13]; d2[3] = s[14]; d3[3] = s[15]; } +#endif // SSE2 + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512VBMI__) @@ -2294,6 +2402,8 @@ static inline void dintrlv_2x256( void *dst0, void *dst1, // 4x64 -> 4x32 +#if defined(__SSE2__) + static inline void rintrlv_4x64_4x32( void *dst, const void *src, const int bit_len ) { @@ -2606,6 +2716,7 @@ static inline void rintrlv_8x32_4x128( void *dst0, void *dst1, // 2x128 -> 4x64 + static inline void rintrlv_2x128_4x64( void *dst, const void *src0, const void *src1, const int bit_len ) { @@ -2872,6 +2983,7 @@ static inline void rintrlv_8x64_4x128( void *dst0, void *dst1, // 8x64 -> 2x256 + static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, const int bit_len ) { @@ -3050,6 +3162,8 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0, d[63] = _mm_unpackhi_epi64( s3[13], s3[15] ); } +#endif // SSE2 + // // Some functions customized for mining. diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 6606abe..4b0ae61 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -1,7 +1,7 @@ #if !defined(SIMD_128_H__) #define SIMD_128_H__ 1 -#if defined(__SSE2__) +#if defined(__x86_64__) && defined(__SSE2__) /////////////////////////////////////////////////////////////////////////////// // @@ -34,6 +34,109 @@ // /////////////////////////////////////////////////////////////////////////////// +// direct translation of native intrinsics + +#define v128_t __m128i + +#define v128_load _mm_load_si128 +#define v128_store _mm_store_si128 + +// arithmetic +#define v128_add64 _mm_add_epi64 +#define v128_add32 _mm_add_epi32 +#define v128_add16 _mm_add_epi16 +#define v128_add8 _mm_add_epi8 + +#define v128_sub64 _mm_sub_epi64 +#define v128_sub32 _mm_sub_epi32 +#define v128_sub16 _mm_sub_epi16 +#define v128_sub8 _mm_sub_epi8 + +// widen +#define v128_mul64 _mm_mul_epu64 +#define v128_mul32 _mm_mul_epu32 +#define v128_mul16 _mm_mul_epu16 + +// save low half +#define v128_mullo32 _mm_mullo_epi32 +#define v128_mullo16 _mm_mullo_epi16 + +// compare +#define v128_cmpeq64 _mm_cmpeq_epi64 +#define v128_cmpeq32 _mm_cmpeq_epi32 +#define v128_cmpeq16 _mm_cmpeq_epi16 + +#define v128_cmpgt64 _mm_cmpgt_epi64 +#define v128_cmpgt32 _mm_cmpgt_epi32 +#define v128_cmpgt16 _mm_cmpgt_epi16 + +#define v128_cmplt64 _mm_cmplt_epi64 +#define v128_cmplt32 _mm_cmplt_epi32 +#define v128_cmplt16 _mm_cmplt_epi16 + +// bit shift +#define v128_sl64 _mm_slli_epi64 +#define v128_sl32 _mm_slli_epi32 +#define v128_sl16 _mm_slli_epi16 + +#define v128_sr64 _mm_srli_epi64 +#define v128_sr32 _mm_srli_epi32 +#define v128_sr16 _mm_srli_epi16 + +#define v128_sra64 _mm_srai_epi64 +#define v128_sra32 _mm_srai_epi32 +#define v128_sra16 _mm_srai_epi16 + +// logic +#define v128_or _mm_or_si128 +#define v128_and _mm_and_si128 +#define v128_xor _mm_xor_si128 +#define v128_xorq _mm_xor_si128 +#define v128_andnot _mm_andnot_si128 +#define 
v128_xorandnot( v2, v1, v0 ) _mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) ) +#define v128_xor3( v2, v1, v0 ) _mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) ) +#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) ) +#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) ) +#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) ) +#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c )) +#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) ) +#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) ) +#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) ) +#define v128_nor mm128_nor + +#define v128_alignr64 mm128_alignr_64 +#define v128_alignr32 mm128_alignr_32 + +#if defined(__SSSE3__) + +#define v128_alignr8 _mm_alignr_epi8 + +#endif + +// NEON version uses vector mask +#if defined(__SSE4_1__) + +#define v128_blend16 _mm_blend_epi16 + +#endif + +#define v128_unpacklo64 _mm_unpacklo_epi64 +#define v128_unpackhi64 _mm_unpackhi_epi64 + +#define v128_unpacklo32 _mm_unpacklo_epi32 +#define v128_unpackhi32 _mm_unpackhi_epi32 + +#define v128_unpacklo16 _mm_unpacklo_epi16 +#define v128_unpackhi16 _mm_unpackhi_epi16 + +#define v128_unpacklo8 _mm_unpacklo_epi8 +#define v128_unpackhi8 _mm_unpackhi_epi8 + +// AES +#define v128_aesenc _mm_aesenc_si128 +#define v128_aesenclast _mm_aesenclast_si128 +#define v128_aesdec _mm_aesdec_si128 +#define v128_aesdeclast _mm_aesdeclast_si128 // Used instead if casting. typedef union @@ -43,14 +146,22 @@ typedef union } __attribute__ ((aligned (16))) m128_ovly; -#define v128_64(i64) _mm_set1_epi64x(i64) -#define v128_32(i32) _mm_set1_epi32(i32) +#define mm128_64(i64) _mm_set1_epi64x(i64) +#define mm128_32(i32) _mm_set1_epi32(i32) +#define v128_32 mm128_32 +#define v128_64 mm128_64 + +#define v128_set64 _mm_set_epi64x +#define v128_set_64 v128_set64 // deprecated +#define v128_set32 _mm_set_epi32 +#define v128_set_32 v128_set32 // deprecated + // Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements // that make these functions either unnecessary or inefficient. // In cases where an explicit move betweeen GP & SIMD registers is still // necessary the cvt, set, or set1 intrinsics can be used allowing the -// compiler to exploilt new features to produce optimum code. +// compiler to exploit new features to produce optimum code. static inline __m128i mm128_mov64_128( const uint64_t n ) { __m128i a; @@ -61,6 +172,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n ) #endif return a; } +#define v128_mov64( u64 ) mm128_mov64_128( u64 ) + static inline __m128i mm128_mov32_128( const uint32_t n ) { @@ -79,7 +192,9 @@ static inline __m128i mm128_mov32_128( const uint32_t n ) //#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 ) // Pseudo constants -#define m128_zero _mm_setzero_si128() +#define v128_zero _mm_setzero_si128() +#define m128_zero v128_zero + #define m128_one_128 mm128_mov64_128( 1 ) // ASM avoids the need to initialize return variable to avoid compiler warning. @@ -148,6 +263,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m ) // Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1. 
#define mm128_mov32_32( v1, i1, v2, i2 ) \ mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) ) +#define v128_mov32( dst, ld, src, ls ) mm128_mov32_32( dst, ld, src, ls ) #endif // SSE4_1 @@ -166,6 +282,21 @@ static inline __m128i mm128_not( const __m128i v ) #define mm128_not( v ) _mm_xor_si128( v, m128_neg1 ) #endif +#define v128_not mm128_not + + +static inline __m128i mm128_negate_64( __m128i v ) +{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); } +#define v128_negate64 mm128_negate_64 + +static inline __m128i mm128_negate_32( __m128i v ) +{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); } +#define v128_negate32 mm128_negate_32 + +static inline __m128i mm128_negate_16( __m128i v ) +{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); } +#define v128_negate16 mm128_negate_16 + // Add 4 values, fewer dependencies than sequential addition. #define mm128_add4_64( a, b, c, d ) \ @@ -173,6 +304,7 @@ static inline __m128i mm128_not( const __m128i v ) #define mm128_add4_32( a, b, c, d ) \ _mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) ) +#define v128_add4_32 mm128_add4_32 #define mm128_add4_16( a, b, c, d ) \ _mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) ) @@ -191,13 +323,16 @@ static inline __m128i mm128_not( const __m128i v ) // returns p as pointer to vector type #define castp_m128i(p) ((__m128i*)(p)) + // p = any aligned pointer // returns *p, watch your pointer arithmetic #define cast_m128i(p) (*((__m128i*)(p))) +#define cast_v128 cast_m128i // p = any aligned pointer, i = scaled array index // returns value p[i] #define casti_m128i(p,i) (((__m128i*)(p))[(i)]) +#define casti_v128 casti_m128i // p = any aligned pointer, o = scaled offset // returns pointer p+o @@ -211,12 +346,15 @@ static inline __m128i mm128_not( const __m128i v ) static inline void memset_zero_128( __m128i *dst, const int n ) { for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; } +#define v128_memset_zero memset_zero_128 static inline void memset_128( __m128i *dst, const __m128i a, const int n ) { for ( int i = 0; i < n; i++ ) dst[i] = a; } +#define v128_memset memset_128 static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } +#define v128_memcpy memcpy_128 #if defined(__AVX512VL__) //TODO Enable for AVX10_256 @@ -277,9 +415,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_movmask_64( v ) \ _mm_movemask_pd( (__m128d)(v) ) +#define v128_movmask64 mm128_movmask_64 #define mm128_movmask_32( v ) \ _mm_movemask_ps( (__m128)(v) ) +#define v128_movmask32 mm128_movmask_32 // // Bit rotations @@ -295,6 +435,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_rol_64 _mm_rol_epi64 #define mm128_ror_32 _mm_ror_epi32 #define mm128_rol_32 _mm_rol_epi32 +#define mm128_ror_16 _mm_ror_epi16 +#define mm128_rol_16 _mm_rol_epi16 #define mm128_rorx2_64( v1, v0, c ) \ _mm_ror_epi64( v0, c ); \ @@ -326,6 +468,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_rol_32( v, c ) \ _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) +#define mm128_ror_16( v, c ) \ + _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ) + +#define mm128_rol_16( v, c ) \ + _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ) + #define mm128_rorx2_64( v1, v0, c ) \ { \ __m128i t0 = _mm_srli_epi64( v0, c ); \ @@ -368,6 +516,15 @@ static inline void memcpy_128( __m128i *dst, const 
__m128i *src, const int n )

 #endif // AVX512 else SSE2

+#define v128_ror64 mm128_ror_64
+#define v128_rol64 mm128_rol_64
+
+#define v128_ror32 mm128_ror_32
+#define v128_rol32 mm128_rol_32
+
+#define v128_ror16 mm128_ror_16
+#define v128_rol16 mm128_rol_16
+
 // Cross lane shuffles
 //
 // Limited 2 input shuffle, combines shuffle with blend. The destination low
@@ -383,11 +540,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 // Rotate vector elements accross all lanes

 #define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
+#define v128_swap64 mm128_swap_64
+
 #define mm128_shuflr_64 mm128_swap_64
 #define mm128_shufll_64 mm128_swap_64

 #define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
+#define v128_shuflr32 mm128_shuflr_32
+
 #define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
+#define v128_shufll32 mm128_shufll_32
+
+#define mm128_rev_32( v ) _mm_shuffle_epi32( v, 0x1b )
+#define v128_rev32( v ) mm128_rev_32( v )

 /* Not used
 #if defined(__SSSE3__)
@@ -402,12 +567,14 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 // Rotate 64 bit lanes

 #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
+#define v128_swap64_32 mm128_swap64_32
+
 #define mm128_shuflr64_32 mm128_swap64_32
 #define mm128_shufll64_32 mm128_swap64_32

 //TODO Enable for AVX10_256
 #if defined(__AVX512VL__)
- #define m1286_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
+ #define mm128_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
 #elif defined(__SSSE3__)
 #define mm128_shuflr64_24( v ) \
 _mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -415,6 +582,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 #else
 #define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
 #endif
+#define v128_shuflr64_24 mm128_shuflr64_24
+
 #if defined(__AVX512VL__)
 #define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
@@ -425,6 +594,7 @@
 #else
 #define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
 #endif
+#define v128_shuflr64_16 mm128_shuflr64_16

 // Rotate 32 bit lanes

@@ -439,6 +609,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 #endif
 #define mm128_shuflr32_16 mm128_swap32_16
 #define mm128_shufll32_16 mm128_swap32_16
+#define v128_swap32_16 mm128_swap32_16
+
 #if defined(__AVX512VL__)
 #define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
@@ -449,6 +621,7 @@
 #else
 #define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
 #endif
+#define v128_shuflr32_8 mm128_shuflr32_8

 //
 // Endian byte swap.
@@ -549,6 +722,13 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )

 #endif // SSSE3 else SSE2

+#define v128_bswap32 mm128_bswap_32
+#define v128_bswap64 mm128_bswap_64
+#define v128_bswap128 mm128_bswap_128
+#define v128_block_bswap32 mm128_block_bswap_32
+#define v128_block_bswap64 mm128_block_bswap_64
+
+
 // alignr instruction for 32 & 64 bit elements is only available with AVX512
 // but emulated here. Behaviour is consistent with Intel alignr intrinsics.
diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h
index cb9b1b5..0275e39 100644
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -22,7 +22,7 @@
 // Instructions that can move data across 128 bit lane boundary incur a
 // performance penalty over those that can't.

-#if defined(__AVX__)
+#if defined(__x86_64__) && defined(__AVX__)

 // Used instead of casting.
typedef union diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 42d3c5b..1a20997 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -14,7 +14,7 @@ // vectors. It is therefore not technically required for any 512 bit vector // utilities defined below. -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__x86_64__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) // AVX512 intrinsics have a few changes from previous conventions. // diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h index c7508b0..8766f7a 100644 --- a/simd-utils/simd-64.h +++ b/simd-utils/simd-64.h @@ -1,7 +1,7 @@ #if !defined(SIMD_64_H__) #define SIMD_64_H__ 1 -#if defined(__MMX__) && defined(__SSE__) +#if defined(__x86_64__) && defined(__MMX__) && defined(__SSE__) //////////////////////////////////////////////////////////////// // diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 1c4bbbe..7012857 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -2,15 +2,84 @@ #define SIMD_INT_H__ 1 // Endian byte swap +#if defined(__x86_64__) + #define bswap_64 __builtin_bswap64 #define bswap_32 __builtin_bswap32 +#elif defined(__aarch64__) + +//#pragma message "aarch64 fast bswap" + +static inline uint64_t bswap_64( uint64_t a ) +{ + uint64_t b; + asm( "rev %0, %1\n\t" : "=r"(b) : "r"(a) ); + return b; +} + +static inline uint32_t bswap_32( uint32_t a ) +{ + uint32_t b; + asm( "rev32 %0, %1\n\t" : "=r"(b) : "r"(a) ); + return b; +} + +#else + +#define bswap_64(x) \ + ( ( ( (x) & 0x00000000FFFFFFFF ) << 32 ) \ + | ( ( (x) & 0xFFFFFFFF00000000 ) >> 32 ) \ + | ( ( (x) & 0x0000FFFF0000FFFF ) << 16 ) \ + | ( ( (x) & 0xFFFF0000FFFF0000 ) >> 16 ) \ + | ( ( (x) & 0x00FF00FF00FF00FF ) << 8 ) \ + | ( ( (x) & 0xFF00FF00FF00FF00 ) >> 8 ) ) + +#define bswap_32(x) \ + ( ( ( (x) << 24 ) & 0xff000000 ) | ( ((x) << 8 ) & 0x00ff0000 ) \ + | ( ( (x) >> 8 ) & 0x0000ff00 ) | ( ((x) >> 24 ) & 0x000000ff ) ) + +#endif + // Bit rotation +#if defined(__x86_64__) + #define rol64 __rolq #define ror64 __rorq #define rol32 __rold #define ror32 __rord +#elif defined(__aarch64__) + +//#pragma message "aarch64 fast bit rotation" + +// "ror" instruction (intrinsic?) for 32 & 64 bits, args must determine size. + +static inline uint64_t ror64( uint64_t a, const int c ) +{ + uint64_t b; + asm( "ror %0, %1, %2\n\t" : "=r"(b) : "r"(a), "r"(c) ); + return b; +} +#define rol64( a, c ) ror64( a, 64-(c) ) + +static inline uint32_t ror32( uint32_t a, const int c ) +{ + uint32_t b; + asm( "ror %0, %1, %2\n\t" : "=r"(b) : "r"(a), "r"(c) ); + return b; +} +#define rol32( a, c ) ror32( a, 32-(c) ) + +#else + +#define ror64( x, c ) ( ( (x) >> (c) ) | ( (x) << (64-(c)) ) ) +#define rol64( x, c ) ( ( (x) << (c) ) | ( (x) >> (64-(c)) ) ) +#define ror32( x, c ) ( ( (x) >> (c) ) | ( (x) << (32-(c)) ) ) +#define rol32( x, c ) ( ( (x) << (c) ) | ( (x) >> (32-(c)) ) ) + +#endif + // Safe division, integer or floating point. For floating point it's as // safe as 0 is precisely zero. // Returns safe_result if division by zero, typically zero. 
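The AArch64 bswap_*/ror*/rol* fallbacks added above can also be expressed with
compiler builtins and plain shift idioms. A minimal sketch, assuming GCC or
Clang (illustrative only, not taken from the patch) -- both compilers lower
these to the same "rev"/"ror" style instructions on AArch64 while remaining
portable to any target:

   #include <stdint.h>

   static inline uint64_t bswap_64_sketch( uint64_t x )
   {  return __builtin_bswap64( x );  }   // typically a single byte-reverse instruction

   static inline uint32_t bswap_32_sketch( uint32_t x )
   {  return __builtin_bswap32( x );  }   // 32-bit byte reverse

   static inline uint32_t ror32_sketch( uint32_t x, unsigned c )
   {  c &= 31; return c ? ( x >> c ) | ( x << ( 32 - c ) ) : x;  }  // recognized as a rotate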
diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h new file mode 100644 index 0000000..205b847 --- /dev/null +++ b/simd-utils/simd-neon.h @@ -0,0 +1,242 @@ +#if defined(__aarch64__) && defined(__ARM_NEON) + +// targeted functions using generic names makes portable obsolete + +#define v128_t uint32x4_t + +// load & store +#define v128_load( p ) vld1q_u32( (uint32_t*)(p) ) +#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v ) + +// arithmetic +#define v128_add64 vaddq_u64 +#define v128_add32 vaddq_u32 +#define v128_add16 vaddq_u16 +#define v128_add8 vaddq_u8 + +#define v128_sub64 vsubq_u64 +#define v128_sub32 vsubq_u32 +#define v128_sub16 vsubq_u16 +#define v128_sub8 vsubq_u8 + +// return low half +#define v128_mullo64 vmulq_u64 +#define v128_mullo32 vmulq_u32 +#define v128_mullo16 vmulq_u16 + +// widen not working, use placeholders +//#define v128_mul32 vmull_u32 +//#define v128_mul16 vmull_u16 +#define v128_mul64 vmulq_u64 +#define v128_mul32 vmulq_u32 +#define v128_mul16 vmulq_u16 + +// compare +#define v128_cmpeq64 vceqq_u64 +#define v128_cmpeq32 vceqq_u32 +#define v128_cmpeq16 vceqq_u16 + +#define v128_cmpgt64 vcgtq_u64 +#define v128_cmpgt32 vcgtq_u32 +#define v128_cmpgt16 vcgtq_u16 + +#define v128_cmplt64 vcltq_u64 +#define v128_cmplt32 vcltq_u32 +#define v128_cmplt16 vcltq_u16 + +// bit shift & rotate +#define v128_sl64 vshlq_n_u64 +#define v128_sl32 vshlq_n_u32 +#define v128_sl16 vshlq_n_u16 + +#define v128_sr64 vshrq_n_u64 +#define v128_sr32 vshrq_n_u32 +#define v128_sr16 vshrq_n_u16 + +#define v128_sra64 vshrq_n_s64 +#define v128_sra32 vshrq_n_s32 +#define v128_sra16 vshrq_n_s16 + +// logical ops +#define v128_or vorrq_u32 +#define v128_and vandq_u32 +#define v128_not vmvnq_u32 +#define v128_xor veorq_u32 + +#define v128_xor3( v2, v1, v0 ) v128_xor( v2, v128_xor( v1, v0 ) ) +//#define v128_xor3 veor3q_u32 +#define v128_nor vornq_u32 +#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32(v1), v0 ) +#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) ) +#define v128_and3( a, b, c ) v128_and( a, v128_and( b, c ) ) +#define v128_or3( a, b, c ) v128_or( a, v128_or( b, c ) ) +#define v128_xorand( a, b, c ) v128_xor( a, v128_and( b, c ) ) +#define v128_andxor( a, b, c ) v128_and( a, v128_xor( b, c )) +#define v128_xoror( a, b, c ) v128_xor( a, v128_or( b, c ) ) +#define v128_orand( a, b, c ) v128_or( a, v128_and( b, c ) ) +#define v128_xnor( a, b ) v128_not( v128_xor( a, b ) ) + +#define v128_alignr64 vextq_u64 +#define v128_alignr32 vextq_u32 +#define v128_alignr8 vextq_u8 + +#define v128_unpacklo64 vtrn1q_u64 +#define v128_unpackhi64 vtrn2q_u64 + +#define v128_unpacklo32 vtrn1q_u32 +#define v128_unpackhi32 vtrn2q_u32 + +#define v128_unpacklo16 vtrn1q_u16 +#define v128_unpackhi16 vtrn2q_u16 + +#define v128_unpacklo8 vtrn1q_u8 +#define v128_unpackhi8 vtrn2q_u8 + +// AES +// consistent with Intel AES, break up for optimizing +#define v128_aesenc( v, k ) vaesmcq_u8( vaeseq_u8( v, k ) ) +#define v128_aesenclast( v, k ) vaeseq_u8( v, k ) + +#define v128_aesdec( v, k ) vaesimcq_u8( vaesdq_u8( v, k ) ) +#define v128_aesdeclast( v, k ) vaesdq_u8( v, k ) + +// pointer indexing +#define casti_v128( p, i ) (((uint32x4_t*)(p))[i]) + +#define cast_v128( p ) (*((uint32x4_t*)(p))) + + +// Many NEON instructions are sized when they don't need to be, for example +// zero, which may cause the compiler to complain when the sizes don't match. +// use "-flax_vector_conversions". 
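// A hedged illustration (not part of the patch): the explicit alternative to
// -flax-vector-conversions is a vreinterpretq cast, which costs no instructions
// at run time and is what the u32_to_u64 / u64_to_u32 style wrappers below map
// onto, e.g.
//
//    uint32x4_t a = vdupq_n_u32( 1 );
//    uint64x2_t b = vreinterpretq_u64_u32( a );       // same 128 bits, 64 bit lanes
//    a = vreinterpretq_u32_u64( vaddq_u64( b, b ) );  // back to 32 bit lanes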
+
+#define u32_to_u64 vreinterpretq_u64_u32
+#define u64_to_u32 vreinterpretq_u32_u64
+
+#define u64_to_u8 vreinterpretq_u8_u64
+#define u8_to_u64 vreinterpretq_u64_u8
+
+#define u32_to_u8 vreinterpretq_u8_u32
+#define u8_to_u32 vreinterpretq_u32_u8
+
+#define v128_zero v128_64( 0ull )
+//#define v128_zero_fn() v128_64( 0ull )
+//#define v128_zero v128_zero_fn
+
+// set1
+#define v128_32 vmovq_n_u32
+#define v128_64 vmovq_n_u64
+
+#define v128_set64( u64_1, u64_0 ) \
+ ( (uint64x2_t)( ( (uint128_t)(u64_1) << 64 ) | (uint128_t)(u64_0) ) )
+#define v128_set_64 v128_set64 // deprecated
+
+#define v128_set32( u32_3, u32_2, u32_1, u32_0 ) \
+ (uint32x4_t)( ( (uint128_t)(u32_3) << 96 ) | ( (uint128_t)(u32_2) << 64 ) \
+ | ( (uint128_t)(u32_1) << 32 ) | ( (uint128_t)(u32_0) ) )
+#define v128_set_32 v128_set32 // deprecated
+
+
+static inline void v128_memset_zero( uint32x4_t *dst, const int n )
+{ for( int i = 0; i < n; i++ ) dst[i] = (uint32x4_t)(uint128_t)0; }
+
+static inline void v128_memset( uint32x4_t *dst, const uint32x4_t a,
+ const int n )
+{ for( int i = 0; i < n; i++ ) dst[i] = a; }
+
+static inline void v128_memcpy( uint32x4_t *dst, const uint32x4_t *src, const int n )
+{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
+
+// select src & dst lanes
+#define v128_mov32( dst, ld, src, ls ) vcopyq_laneq_u32( dst, ld, src, ls )
+
+// move src u64 to lane 0, NEON needs a source vector to write into
+#define v128_mov64( u64 ) (uint64x2_t)(uint128_t)(u64)
+
+static inline uint64x2_t v128_negate64( uint64x2_t v )
+{ return v128_sub64( v128_xor( v, v ), v ); }
+
+static inline uint32x4_t v128_negate32( uint32x4_t v )
+{ return v128_sub32( v128_xor( v, v ), v ); }
+
+static inline uint16x8_t v128_negate16( uint16x8_t v )
+{ return v128_sub16( v128_xor( v, v ), v ); }
+
+#define v128_add4_32( v3, v2, v1, v0 ) \
+ vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
+
+// how to build a bitmask from vector elements? (one workable approach is
+// sketched below)
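// One possible approach, sketched as an illustration only (these helpers and
// their names are assumptions, not part of the patch): isolate each lane's
// sign bit, weight it by the lane index, then reduce horizontally.

static inline unsigned v128_movmask32_sketch( uint32x4_t v )
{
   const uint32x4_t w = { 1, 2, 4, 8 };                   // per-lane bit weights
   return vaddvq_u32( vmulq_u32( vshrq_n_u32( v, 31 ), w ) );
}

static inline unsigned v128_movmask64_sketch( uint64x2_t v )
{
   const uint64x2_t m = vshrq_n_u64( v, 63 );             // 0 or 1 per lane
   return (unsigned)( vgetq_lane_u64( m, 0 ) | ( vgetq_lane_u64( m, 1 ) << 1 ) );
}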
+#define v128_movmask32 _Static_assert (0, "No ARM target: v128_movmask32") +#define v128_movmask64 _Static_assert (0, "No ARM target: v128_movmask64") + + +static inline uint64x2_t v128_ror64( uint64x2_t v, const int c ) +{ return vsriq_n_u64( vsliq_n_u64( v, v, 64-(c) ), v, c ); } + +static inline uint64x2_t v128_rol64( uint64x2_t v, const int c ) +{ return vsriq_n_u64( vsliq_n_u64( v, v, c ), v, 64-(c) ); } + +static inline uint32x4_t v128_ror32( uint32x4_t v, const int c ) +{ return vsriq_n_u32( vsliq_n_u32( v, v, 32-(c) ), v, c ); } + +static inline uint32x4_t v128_rol32( uint32x4_t v, const int c ) +{ return vsriq_n_u32( vsliq_n_u32( v, v, c ), v, 32-(c) ); } + +static inline uint16x8_t v128_ror16( uint16x8_t v, const int c ) +{ return vsriq_n_u16( vsliq_n_u16( v, v, 16-(c) ), v, c ); } + +static inline uint16x8_t v128_rol16( uint16x8_t v, const int c ) +{ return vsriq_n_u16( vsliq_n_u16( v, v, c ), v, 16-(c) ); } + +// reverse endian byte order +#define v128_bswap16(v) u8_to_u16( vrev16q_u8( u16_to_u8(v) )) +#define v128_bswap32(v) u8_to_u32( vrev32q_u8( u32_to_u8(v) )) +#define v128_bswap64(v) u8_to_u64( vrev64q_u8( u64_to_u8(v) )) +#define v128_bswap128(v) v128_swap64( v128_bswap64(v) ) + +#define v128_block_bswap32( dst, src ) \ + casti_v128( dst, 0 ) = v128_bswap32( casti_v128( src, 0 ) ); \ + casti_v128( dst, 1 ) = v128_bswap32( casti_v128( src, 1 ) ); \ + casti_v128( dst, 2 ) = v128_bswap32( casti_v128( src, 2 ) ); \ + casti_v128( dst, 3 ) = v128_bswap32( casti_v128( src, 3 ) ); \ + casti_v128( dst, 4 ) = v128_bswap32( casti_v128( src, 4 ) ); \ + casti_v128( dst, 5 ) = v128_bswap32( casti_v128( src, 5 ) ); \ + casti_v128( dst, 6 ) = v128_bswap32( casti_v128( src, 6 ) ); \ + casti_v128( dst, 7 ) = v128_bswap32( casti_v128( src, 7 ) ); + +#define v128_block_bswap64( dst, src ) \ + dst[0] = v128_bswap64( src[0] ); \ + dst[1] = v128_bswap64( src[1] ); \ + dst[2] = v128_bswap64( src[2] ); \ + dst[3] = v128_bswap64( src[3] ); \ + dst[4] = v128_bswap64( src[4] ); \ + dst[5] = v128_bswap64( src[5] ); \ + dst[6] = v128_bswap64( src[6] ); \ + dst[7] = v128_bswap64( src[7] ); + +#define v128_rev32( v ) vrev64q_u32( v ) + +static inline uint32x4_t v128_swap64( uint32x4_t v ) +{ return vextq_u64( v, v, 1 ); } + +static inline uint32x4_t v128_swap32( uint32x4_t v ) +{ return vextq_u32( v, v, 2 ); } + +static inline uint32x4_t v128_shuflr32( uint32x4_t v ) +{ return vextq_u32( v, v, 1 ); } + +static inline uint32x4_t v128_shufll32( uint32x4_t v ) +{ return vextq_u32( v, v, 3 ); } + +#define v128_swap64_32(v) v128_ror64( v, 32 ) +#define v128_shuflr64_24(v) v128_ror64( v, 24 ) +#define v128_shuflr64_16(v) v128_ror64( v, 16 ) + +#define v128_swap32_16(v) v128_ror32( v, 16 ) +#define v128_shuflr32_8(v) v128_ror32( v, 8 ) + +// Not the same as SSE2, this uses vector mask, SSE2 uses imm8 mask. 
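// If a caller is being ported from the SSE4.1 immediate form, an equivalent
// vector mask can be built from the 8-bit immediate with one bit test per
// 16-bit lane. A sketch only, under that assumption and not part of the patch:
//
//    static inline uint16x8_t v128_blendmask16_sketch( unsigned imm8 )
//    {
//       const uint16x8_t sel = { 1, 2, 4, 8, 16, 32, 64, 128 };
//       return vtstq_u16( vdupq_n_u16( (uint16_t)imm8 ), sel );  // all-ones where the bit is set
//    }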
+#define v128_blend16( v1, v0, mask ) \ + v128_or( v128_and( mask, v1 ), v128_andnot( mask, v0 ) ) + +#endif diff --git a/sysinfos.c b/sysinfos.c index aebb069..3b8f06a 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -13,9 +13,15 @@ #include #include #include - #include "miner.h" +#if defined(__aarch64__) +// for arm's "cpuid" +#include +#include + +#endif + #ifndef WIN32 // 1035g1: /sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input @@ -282,10 +288,11 @@ static inline int cpu_fanpercent() #define AVX512_mask (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag) -#ifndef __arm__ +#if defined(__x86_64__) static inline void cpuid( unsigned int leaf, unsigned int subleaf, unsigned int output[4] ) { + #if defined (_MSC_VER) || defined (__INTEL_COMPILER) // Microsoft or Intel compiler, intrin.h included __cpuidex(output, leaf, subleaf ); @@ -313,7 +320,16 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf, } #endif } -#else /* !__arm__ */ + +#elif defined(__aarch64__) + +static inline void cpuid( unsigned int leaf, unsigned int subleaf, + unsigned int output[4] ) +{ + output[0] = getauxval(AT_HWCAP); +} + +#else #define cpuid(leaf, subleaf, out) out[0] = 0; #endif @@ -421,6 +437,32 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz) #endif } +/* +#ifdef __aarch64__ +#warning "__aarch64__" +#endif +#ifdef __ARM_ARCH +#warning "__ARM_ARCH " __ARM_ARCH +#endif +#ifdef __ARM_NEON +#warning "__ARM_NEON" +#endif +#ifdef __ARM_FEATURE_CRYPTO +#warning "___ARM_FEATURE_CRYPTO" +#endif +#ifdef __ARM_FEATURE_AES +#warning "___ARM_FEATURE_AES" +#endif +#ifdef __ARM_FEATURE_SHA2 +#warning "___ARM_FEATURE_SHA2" +#endif +#ifdef __ARM_FEATURE_SHA3 +#warning "___ARM_FEATURE_SHA3" +#endif +*/ + + + // Typical display format: AVX10.[version]_[vectorlength], if vector length is // omitted 256 is the default. // Ex: AVX10.1_512 @@ -431,23 +473,42 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz) // 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids) // Other combinations are not defined. -// Test AVX10_flag before AVX10_FEATURES flags. +// No technical need for this, the code won't run if false. 
+static inline bool cpu_arch_x86_64() +{ +#if defined(__x86_64__) + return true; +#else + return false; +#endif +} + +static inline bool cpu_arch_aarch64() +{ +#if defined(__aarch64__) + return true; +#else + return false; +#endif +} + static inline bool has_avx10() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 1, cpu_info ); return cpu_info[ EDX_Reg ] & AVX10_Flag; + +#else + return false; #endif } static inline unsigned int avx10_version() { -#ifdef __arm__ - return 0; -#else +#if defined(__x86_64__) + if ( has_avx10() ) { unsigned int cpu_info[4] = { 0 }; @@ -455,14 +516,16 @@ static inline unsigned int avx10_version() return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask; } return 0; + +#else + return 0; #endif } static inline bool has_avx10_512() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + if ( has_avx10() ) { unsigned int cpu_info[4] = { 0 }; @@ -470,14 +533,16 @@ static inline bool has_avx10_512() return cpu_info[ EBX_Reg ] & AVX10_512_Flag; } return false; + +#else + return false; #endif } static inline bool has_avx10_256() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + if ( has_avx10() ) { unsigned int cpu_info[4] = { 0 }; @@ -485,15 +550,17 @@ static inline bool has_avx10_256() return cpu_info[ EBX_Reg ] & AVX10_256_Flag; } return false; + +#else + return false; #endif } // Maximum vector length static inline unsigned int avx10_vector_length() { -#ifdef __arm__ - return 0; -#else +#if defined(__x86_64__) + if ( has_avx10() ) { unsigned int cpu_info[4] = { 0 }; @@ -502,222 +569,288 @@ static inline unsigned int avx10_vector_length() : ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 ); } return 0; + +#else + return 0; #endif } +static inline bool has_neon() +{ +#if defined(__aarch64__) + unsigned int cpu_info[4] = { 0 }; + return cpu_info[0]; +#else + return false; +#endif +} + static inline bool has_sha() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) && defined(__SSE2__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 0, cpu_info ); return cpu_info[ EBX_Reg ] & SHA_Flag; + +#elif defined(__aarch64__) && defined(__ARM_NEON) + + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[0] & HWCAP_SHA2; + +#else + return false; #endif } static inline bool has_sha512() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) && defined(__AVX2__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 1, cpu_info ); return cpu_info[ EAX_Reg ] & SHA512_Flag; + +#elif defined(__aarch64__) && defined(__ARM_NEON) + + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[0] & HWCAP_SHA3; + +#else + return false; #endif } static inline bool has_sse2() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + unsigned int cpu_info[4] = { 0 }; cpuid( CPU_INFO, 0, cpu_info ); return cpu_info[ EDX_Reg ] & SSE2_Flag; + +#else + return false; #endif } -// nehalem and above, no AVX on nehalem static inline bool has_aes_ni() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) && defined(__SSE2__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( CPU_INFO, 0, cpu_info ); + return cpu_info[ ECX_Reg ] & AES_NI_Flag; + +#elif defined(__aarch64__) && defined(__ARM_NEON) + + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[0] & HWCAP_AES; + #else - unsigned int cpu_info[4] = { 0 }; - cpuid( CPU_INFO, 0, cpu_info ); - return cpu_info[ ECX_Reg ] & 
AES_NI_Flag; + return false; #endif } -// westmere and above static inline bool has_avx() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( CPU_INFO, 0, cpu_info ); + return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask ); + #else - unsigned int cpu_info[4] = { 0 }; - cpuid( CPU_INFO, 0, cpu_info ); - return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask ); + return false; #endif } -// haswell and above static inline bool has_avx2() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX2_Flag; + #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ EBX_Reg ] & AVX2_Flag; + return false; #endif } static inline bool has_avx512f() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512_F_Flag; #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ EBX_Reg ] & AVX512_F_Flag; + return false; #endif } static inline bool has_avx512dq() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512_DQ_Flag; #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ EBX_Reg ] & AVX512_DQ_Flag; + return false; #endif } static inline bool has_avx512bw() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512_BW_Flag; #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ EBX_Reg ] & AVX512_BW_Flag; + return false; #endif } static inline bool has_avx512vl() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512_VL_Flag; #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ EBX_Reg ] & AVX512_VL_Flag; + return false; #endif } -// Minimum to be useful static inline bool has_avx512() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask ); + #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask ); + return false; #endif } static inline bool has_vaes() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 0, cpu_info ); return cpu_info[ ECX_Reg ] & VAES_Flag; + +#else + return false; #endif } static inline bool has_vbmi() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 0, cpu_info ); return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag; + +#else + return false; #endif } static inline bool has_vbmi2() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 0, cpu_info ); return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag; +#else + return false; #endif } -// AMD only 
+// Obsolete, AMD only
 static inline bool has_xop()
 {
-#ifdef __arm__
-   return false;
+#if defined(__x86_64__)
+
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( EXTENDED_CPU_INFO, 0, cpu_info );
+   return cpu_info[ ECX_Reg ] & XOP_Flag;
 #else
-   unsigned int cpu_info[4] = { 0 };
-   cpuid( EXTENDED_CPU_INFO, 0, cpu_info );
-   return cpu_info[ ECX_Reg ] & XOP_Flag;
+   return false;
 #endif
 }

 static inline bool has_fma3()
 {
-#ifdef __arm__
-   return false;
+#if defined(__x86_64__)
+
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( CPU_INFO, 0, cpu_info );
+   return ( ( cpu_info[ ECX_Reg ] & FMA3_mask ) == FMA3_mask );
+
 #else
-   unsigned int cpu_info[4] = { 0 };
-   cpuid( CPU_INFO, 0, cpu_info );
-   return ( ( cpu_info[ ECX_Reg ] & FMA3_mask ) == FMA3_mask );
+   return false;
 #endif
 }

 static inline bool has_sse42()
 {
-#ifdef __arm__
-   return false;
+#if defined(__x86_64__)
+
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( CPU_INFO, 0, cpu_info );
+   return cpu_info[ ECX_Reg ] & SSE42_Flag;
+
 #else
-   unsigned int cpu_info[4] = { 0 };
-   cpuid( CPU_INFO, 0, cpu_info );
-   return cpu_info[ ECX_Reg ] & SSE42_Flag;
+   return false;
 #endif
 }

 static inline bool has_sse()
 {
-#ifdef __arm__
-   return false;
+#if defined(__x86_64__)
+
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( CPU_INFO, 0, cpu_info );
+   return cpu_info[ EDX_Reg ] & SSE_Flag;
+
 #else
-   unsigned int cpu_info[4] = { 0 };
-   cpuid( CPU_INFO, 0, cpu_info );
-   return cpu_info[ EDX_Reg ] & SSE_Flag;
+   return false;
 #endif
 }

 static inline uint32_t cpuid_get_highest_function_number()
 {
+#if defined(__x86_64__)
+
    unsigned int cpu_info[4] = {0};
    cpuid( VENDOR_ID, 0, cpu_info);
    return cpu_info[ EAX_Reg ];
+
+#else
+   return 0;
+#endif
 }

+// out of date
 static inline void cpuid_get_highest_function( char* s )
 {
-   uint32_t fn = cpuid_get_highest_function_number();
+#if defined(__x86_64__)
+
+   uint32_t fn = cpuid_get_highest_function_number();
    switch (fn)
    {
    case 0x16:
@@ -735,11 +868,16 @@ static inline void cpuid_get_highest_function( char* s )
    default:
       sprintf( s, "undefined %x", fn );
    }
+
+#else
+   s = NULL;
+#endif
 }

+// out of date
 static inline void cpu_bestfeature(char *outbuf, size_t maxsz)
 {
-#ifdef __arm__
+#if defined(__arm__) || defined(__aarch64__)
   sprintf(outbuf, "ARM");
 #else
   int cpu_info[4] = { 0 };
@@ -769,9 +907,8 @@ static inline void cpu_bestfeature(char *outbuf, size_t maxsz)

 static inline void cpu_brand_string( char* s )
 {
-#ifdef __arm__
-   sprintf( s, "ARM" );
-#else
+#if defined(__x86_64__)
+
    int cpu_info[4] = { 0 };
    cpuid( VENDOR_ID, 0, cpu_info );
    if ( cpu_info[ EAX_Reg ] >= 4 )
@@ -783,6 +920,15 @@ static inline void cpu_brand_string( char* s )
       cpuid( CPU_BRAND_3, 0, cpu_info );
       memcpy( s + 32, cpu_info, sizeof(cpu_info) );
    }
+
+#elif defined(__arm__) || defined(__aarch64__)
+
+   sprintf( s, "ARM" );
+
+#else
+
+   sprintf( s, "unknown CPU architecture" );
+
 #endif
 }

diff --git a/util.c b/util.c
index 591ecbe..ae64364 100644
--- a/util.c
+++ b/util.c
@@ -755,9 +755,9 @@ void memrev(unsigned char *p, size_t len)
 {
    if ( len == 32 )
    {
-      __m128i *pv = (__m128i*)p;
-      __m128i t = mm128_bswap_128( pv[0] );
-      pv[0] = mm128_bswap_128( pv[1] );
+      v128_t *pv = (v128_t*)p;
+      v128_t t = v128_bswap128( pv[0] );
+      pv[0] = v128_bswap128( pv[1] );
       pv[1] = t;
    }
    else
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index 336e3c7..9f1721e 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -38,7 +38,7 @@ cp $MINGW_LIB/zlib1.dll release/
 cp $MINGW_LIB/libwinpthread-1.dll release/
 cp $GCC_MINGW_LIB/libstdc++-6.dll release/
 cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
-cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
+cp ./../libcrypto-1_1-x64.dll release/
 cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/

 # Start building...
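
Editor's note: the sysinfos.c hunks above all converge on one pattern: a compile-time #if defined(__x86_64__) guard around a runtime CPUID query, with a false/0 fallback on ARM and other architectures. The sketch below is a minimal, self-contained illustration of that pattern only; it deliberately uses the standard GCC/Clang <cpuid.h> wrapper __get_cpuid_count instead of the miner's internal cpuid() helper and its CPU_INFO / EXTENDED_FEATURES / *_Flag constants, and the function name example_has_avx2 is hypothetical.

/*
 * Minimal sketch of the arch-gated CPUID feature test used in sysinfos.c.
 * Assumes GCC or Clang; does not use any cpuminer-opt internals.
 * AVX2 is reported in CPUID leaf 7, sub-leaf 0, EBX bit 5.
 */
#include <stdbool.h>
#include <stdio.h>

#if defined(__x86_64__)
#include <cpuid.h>
#endif

static inline bool example_has_avx2( void )
{
#if defined(__x86_64__)
   unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
   // __get_cpuid_count returns 0 if the requested leaf is not supported.
   if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
      return false;
   return ( ebx & (1u << 5) ) != 0;   // EBX bit 5 = AVX2
#else
   return false;                      // ARM and other non-x86 targets
#endif
}

int main( void )
{
   printf( "AVX2: %s\n", example_has_avx2() ? "yes" : "no" );
   return 0;
}

The same shape extends to any leaf/bit pair (for example leaf 1, ECX bit 28 for AVX), which is why the patch can keep every has_*() body nearly identical while only the leaf constant and flag mask change.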