v25.1

2025-09-17 23:44:27 +00:00 · 2024-12-30 21:33:04 -05:00
parent a45a333b40
commit 1d9341ee92
37 changed files with 249 additions and 2644 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,15 +5,31 @@ else
 JANSSON_INCLUDES=
 endif
-EXTRA_DIST	= example-cfg.json nomacro.pl
+# Hook for for GMP on MacOS which is provided by homebrew.
 # Homebrew has different linkage on x86_64 & ARM64.
 # Need complex expressions, nesting or elseif, none seem to work.
 if !HAVE_APPLE
  GMP_INCLUDES =
  GMP_LIB = -lgmp
 endif
 if ARM64_APPLE
  GMP_INCLUDES = -I/opt/homebrew/include
  GMP_LIB = /opt/homebrew/lib/libgmp.a
 endif
 if X86_64_APPLE
  GMP_INCLUDES = -I/usr/local/include
  GMP_LIB = /usr/local/lib/libgmp.a
 endif
-SUBDIRS		= compat
+EXTRA_DIST = example-cfg.json nomacro.pl
-ALL_INCLUDES	= @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
+SUBDIRS = compat
-bin_PROGRAMS	= cpuminer
+ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) $(GMP_INCLUDES) -I.
-dist_man_MANS	= cpuminer.1
+bin_PROGRAMS = cpuminer
 dist_man_MANS = cpuminer.1
 cpuminer_SOURCES = \
  dummy.cpp \
@@ -166,8 +182,6 @@ cpuminer_SOURCES = \
  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite-hash-2way.c \
  algo/shavite/shavite-hash-4way.c \
  algo/simd/nist.c \
  algo/simd/vector.c \
  algo/simd/sph_simd.c \
  algo/simd/simd-hash-2way.c \
  algo/skein/sph_skein.c \
@@ -287,15 +301,10 @@ if HAVE_WINDOWS
   cpuminer_SOURCES += compat/winansi.c
 endif
-cpuminer_LDFLAGS	= @LDFLAGS@
+cpuminer_LDFLAGS = @LDFLAGS@
-cpuminer_LDADD	= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@
+cpuminer_LDADD	= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ $(GMP_LIB)
 cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
-cpuminer_CFLAGS   = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
+cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
 # Linking GMP fails on MacOS
 if !HAVE_APPLE
   cpuminer_LDADD += -lgmp
 endif
 if ARCH_ARM64
   cpuminer_CFLAGS += -flax-vector-conversions
--- a/7
+++ b/7
@@ -75,6 +75,13 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 v25.1
 MacOS ARM64: m7m algo is now working.
 MacOS ARM64: can now be compiled with GCC.
 MacOS x86_64: is now working compiled with GCC.
 Fixed some minor bugs & removed some obsolete code.
 v24.8
 ARM: Apple MacOS on M series CPU is now supported compiled from source
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
 static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
                                 size_t byte_len, size_t lim )
 {
-    unsigned eb;
+    __m512i tmp[lim + 1] __attribute__ ((aligned (64)));
    union {
       __m512i tmp[lim + 1];
       uint64_t dummy;   /* for alignment */
    } u;
    size_t j;
    size_t m512_len = byte_len >> 3;
    const unsigned eb = hard_coded_eb;
    eb = hard_coded_eb;
    if ( kc->ptr == (lim - 8) )
    {
        const uint64_t t = eb | 0x8000000000000000;
-        u.tmp[0] = _mm512_set1_epi64( t );
+        tmp[0] = _mm512_set1_epi64( t );
        j = 8;
    }
    else
    {
        j = lim - kc->ptr;
-        u.tmp[0] = _mm512_set1_epi64( eb );
+        tmp[0] = _mm512_set1_epi64( eb );
-        memset_zero_512( u.tmp + 1, (j>>3) - 2 );
+        memset_zero_512( tmp + 1, (j>>3) - 2 );
-        u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
+        tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
    }
-    keccak64_8way_core( kc, u.tmp, j, lim );
+    keccak64_8way_core( kc, tmp, j, lim );
    /* Finalize the "lane complement" */
    NOT64( kc->w[ 1], kc->w[ 1] );
    NOT64( kc->w[ 2], kc->w[ 2] );
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
 static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
            size_t lim )
 {
-    unsigned eb;
+    __m256i tmp[lim + 1] __attribute__ ((aligned (32)));
    union {
       __m256i tmp[lim + 1];
       uint64_t dummy;   /* for alignment */
    } u;
    size_t j;
    size_t m256_len = byte_len >> 3;
    const unsigned eb = hard_coded_eb;
    eb = hard_coded_eb;
    if ( kc->ptr == (lim - 8) )
    {
        const uint64_t t = eb | 0x8000000000000000;
-        u.tmp[0] = _mm256_set1_epi64x( t );
+        tmp[0] = _mm256_set1_epi64x( t );
        j = 8;
    }
    else
    {
        j = lim - kc->ptr;
-        u.tmp[0] = _mm256_set1_epi64x( eb );
+        tmp[0] = _mm256_set1_epi64x( eb );
-        memset_zero_256( u.tmp + 1, (j>>3) - 2 );
+        memset_zero_256( tmp + 1, (j>>3) - 2 );
-        u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
+        tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
    }
-    keccak64_core( kc, u.tmp, j, lim );
+    keccak64_core( kc, tmp, j, lim );
    /* Finalize the "lane complement" */
    NOT64( kc->w[ 1], kc->w[ 1] );
    NOT64( kc->w[ 2], kc->w[ 2] );
--- a/algo/m7m/m7m.c
+++ b/algo/m7m/m7m.c
@@ -1,8 +1,6 @@
 #include "cpuminer-config.h"
 #include "algo-gate-api.h"
 #if !defined(__APPLE__)
 #include <gmp.h>
 #include <stdbool.h>
 #include <stdlib.h>
@@ -33,6 +31,7 @@ static inline double exp_n( double xt )
        return exp( xt );
 }
 /*
 static inline double exp_n2( double x1, double x2 )
 {
    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
@@ -53,6 +52,7 @@ static inline double exp_n2( double x1, double x2 )
    else if ( xt > p6 - 1.e-200 )
        return 0.;
 }
 */
 double swit2_( double wvnmb )
 {
@@ -298,14 +298,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
    return 0;
 }
 #endif   // not apple
 bool register_m7m_algo( algo_gate_t *gate )
 {
 #if defined(__APPLE__)
  applog( LOG_ERR, "M7M algo is not supported on MacOS");
  return false;
 #else  
  gate->optimizations = SHA256_OPT;
  init_m7m_ctx();
  gate->scanhash              = (void*)&scanhash_m7m_hash;
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
  gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
  opt_target_factor = 65536.0;
  return true;
 #endif
 }
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -11,7 +11,6 @@
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/shavite/shavite-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
@@ -617,9 +616,9 @@ union _hmq1725_4way_context_overlay
    cubehashParam           cube;
    cube_2way_context       cube2;
    sph_shavite512_context  shavite;
-    hashState_sd            sd;
+    simd512_context         simd;
    shavite512_2way_context shavite2;
-    simd_2way_context       simd;
+    simd_2way_context       simd_2way;
    hashState_echo          echo;
    hamsi512_4way_context   hamsi;
    hashState_fugue         fugue;
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
    shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
-    simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
+    simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
-    simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
+    simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );
    rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );     
@@ -869,41 +868,25 @@ extern void hmq1725_4way_hash(void *state, const void *input)
       echo_full( &ctx.echo, (BitSequence *)hash0, 512,
                       (const BitSequence *)hash0, 64 );
    else
-    {
+       simd512_ctx( &ctx.simd, hash0, hash0, 64 );
       init_sd( &ctx.sd, 512 );
       update_final_sd( &ctx.sd, (BitSequence *)hash0,
                           (const BitSequence *)hash0, 512 );
    }
   if ( hash1[0] & mask ) //4
       echo_full( &ctx.echo, (BitSequence *)hash1, 512,
                       (const BitSequence *)hash1, 64 );
   else
-   {
+       simd512_ctx( &ctx.simd, hash1, hash1, 64 );
       init_sd( &ctx.sd, 512 );
       update_final_sd( &ctx.sd, (BitSequence *)hash1,
                           (const BitSequence *)hash1, 512 );
   }
   if ( hash2[0] & mask ) //4
       echo_full( &ctx.echo, (BitSequence *)hash2, 512,
                       (const BitSequence *)hash2, 64 );
   else
-   {
+       simd512_ctx( &ctx.simd, hash2, hash2, 64 );
       init_sd( &ctx.sd, 512 );
       update_final_sd( &ctx.sd, (BitSequence *)hash2,
                           (const BitSequence *)hash2, 512 );
   }
   if ( hash3[0] & mask ) //4
       echo_full( &ctx.echo, (BitSequence *)hash3, 512,
                       (const BitSequence *)hash3, 64 );
   else
-   {
+       simd512_ctx( &ctx.simd, hash3, hash3, 64 );
       init_sd( &ctx.sd, 512 );
       update_final_sd( &ctx.sd, (BitSequence *)hash3,
                           (const BitSequence *)hash3, 512 );
   }
   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
--- a/algo/scrypt/neoscrypt.c
+++ b/algo/scrypt/neoscrypt.c
@@ -46,7 +46,7 @@
 #endif
 #ifdef __GNUC__
-#if defined(NOASM) || defined(__arm__) || defined(__aarch64__)
+#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__)
 #define ASM 0
 #else
 #define ASM 1
--- a/algo/simd/nist.c
+++ b/algo/simd/nist.c
@@ -1,472 +0,0 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "nist.h"
 #include "simd_iv.h"
 /* #define NO_PRECOMPUTED_IV */
 #if defined(__SSE2__)  // || defined(__ARM_NEON)
 /* 
 * Increase the counter.
 */
 void IncreaseCounter(hashState_sd *state, DataLength databitlen) {
 #ifdef HAS_64
      state->count += databitlen;
 #else
      uint32_t old_count = state->count_low;
      state->count_low += databitlen;
      if (state->count_low < old_count)
        state->count_high++;
 #endif
 }
 /* 
 * Initialize the hashState_sd with a given IV.
 * If the IV is NULL, initialize with zeros.
 */
 int InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) {
  int n = 8;
  state->hashbitlen = hashbitlen;
  state->n_feistels = n;
  state->blocksize = 128*8;
 #ifdef HAS_64
  state->count = 0;
 #else
  state->count_low  = 0;
  state->count_high = 0;
 #endif  
 //  state->buffer = malloc(16*n + 16);
  /*
   * Align the buffer to a 128 bit boundary.
   */
 //  state->buffer += ((unsigned char*)NULL - state->buffer)&15;
 //  state->A = malloc((4*n+4)*sizeof(u32));
  /*
   * Align the buffer to a 128 bit boundary.
   */
 //  state->A += ((u32*)NULL - state->A)&3;
  state->B = state->A+n;
  state->C = state->B+n;
  state->D = state->C+n;
  if (IV)
    memcpy(state->A, IV, 4*n*sizeof(u32));
  else
    memset(state->A, 0, 4*n*sizeof(u32));
   // free(state->buffer);
  //  free(state->A);	
  return 0;
 }
 /* 
 * Initialize the hashState_sd.
 */
 int init_sd(hashState_sd *state, int hashbitlen) {
  int r;
  char *init;
 #ifndef NO_PRECOMPUTED_IV
 //  if (hashbitlen == 224)
 //    r=InitIV(state, hashbitlen, IV_224);
 //  else if (hashbitlen == 256)
 //    r=InitIV(state, hashbitlen, IV_256);
 //  else if (hashbitlen == 384)
 //    r=InitIV(state, hashbitlen, IV_384);
 //  else
  if (hashbitlen == 512)
    r = InitIV(state, hashbitlen, IV_512);
  else
 #endif
    {
      /* 
       * Nonstandart length: IV is not precomputed.
       */
      r=InitIV(state, hashbitlen, NULL);
      if (r != 0)
        return r;
      init = malloc(state->blocksize);
      memset(init, 0, state->blocksize);
 #if defined __STDC__ && __STDC_VERSION__ >= 199901L
      snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen);
 #else
      sprintf(init, "SIMD-%i v1.1", hashbitlen);
 #endif
      SIMD_Compress(state, (unsigned char*) init, 0);
      free(init);
    }
  return r;
 }
 int update_sd( hashState_sd *state, const BitSequence *data,
                      DataLength databitlen )
 {
  unsigned current;
  unsigned int bs = state->blocksize;
  static int align = -1;
  if (align == -1)
    align = RequiredAlignment();
 #ifdef HAS_64
  current = state->count & (bs - 1);
 #else
  current = state->count_low & (bs - 1);
 #endif
  if ( current & 7 )
  {
    // The number of hashed bits is not a multiple of 8.
    // Very painfull to implement and not required by the NIST API.
    return 1;
  }
  while ( databitlen > 0 )
  {
    if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
    {
       // We can hash the data directly from the input buffer.
      SIMD_Compress(state, data, 0);
      databitlen -= bs;
      data += bs/8;
      IncreaseCounter(state, bs);
    }
    else
    {
       // Copy a chunk of data to the buffer
      unsigned int len = bs - current;
      if ( databitlen < len )
      {
        memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
        IncreaseCounter( state, databitlen );        
        return 0;
      }
      else
      {
        memcpy( state->buffer+current/8, data, len/8 );
        IncreaseCounter( state,len );
        databitlen -= len;
        data += len/8;
        current = 0;
        SIMD_Compress( state, state->buffer, 0 );
      }
    }
  }
  return 0;
 }
 int final_sd( hashState_sd *state, BitSequence *hashval )
 {
 #ifdef HAS_64
  uint64_t l;
  int current = state->count & (state->blocksize - 1);
 #else
  uint32_t l;
  int current = state->count_low & (state->blocksize - 1);
 #endif
  unsigned int i;
  BitSequence bs[64];
  int isshort = 1;
  // If there is still some data in the buffer, hash it
  if ( current )
  {
    // We first need to zero out the end of the buffer.
    if ( current & 7 )
    {
      BitSequence mask = 0xff >> ( current & 7 );
      state->buffer[current/8] &= ~mask;
    }
    current = ( current+7 ) / 8;
    memset( state->buffer+current, 0, state->blocksize/8 - current );
    SIMD_Compress( state, state->buffer, 0 );
  }
  //* Input the message length as the last block
  memset( state->buffer, 0, state->blocksize / 8 );
 #ifdef HAS_64
  l = state->count;
  for ( i=0; i<8; i++ )
  {
    state->buffer[i] = l & 0xff;
    l >>= 8;
  }
  if ( state->count < 16384 )
    isshort = 2;
 #else
  l = state->count_low;
  for ( i=0; i<4; i++ )
  {
    state->buffer[i] = l & 0xff;
    l >>= 8;
  }
  l = state->count_high;
  for ( i=0; i<4; i++ )
  {
    state->buffer[4+i] = l & 0xff;
    l >>= 8;
  }
  if ( state->count_high == 0 && state->count_low < 16384 )
    isshort = 2;
 #endif
  SIMD_Compress( state, state->buffer, isshort );
  // Decode the 32-bit words into a BitSequence
  for ( i=0; i < 2*state->n_feistels; i++ )
  {
    u32 x = state->A[i];
    bs[4*i  ] = x&0xff;
    x >>= 8;
    bs[4*i+1] = x&0xff;
    x >>= 8;
    bs[4*i+2] = x&0xff;
    x >>= 8;
    bs[4*i+3] = x&0xff;
  }
  memcpy( hashval, bs, state->hashbitlen / 8 );
  if ( state->hashbitlen % 8 )
  {
    BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
    hashval[state->hashbitlen/8 + 1] = bs[state->hashbitlen/8 + 1] & mask;
  }
  return 0;
 }
 int update_final_sd( hashState_sd *state, BitSequence *hashval,
                            const BitSequence *data, DataLength databitlen )
 {
  int current, i;
  unsigned int bs = state->blocksize;
  static int align = -1;
  BitSequence out[64];
  int isshort = 1;
  uint64_t l;
  if (align == -1)
    align = RequiredAlignment();
 #ifdef HAS_64
  current = state->count & (bs - 1);
 #else
  current = state->count_low & (bs - 1);
 #endif
  if ( current & 7 )
  {
    // The number of hashed bits is not a multiple of 8.
    // Very painfull to implement and not required by the NIST API.
    return 1;
  }
  while ( databitlen > 0 )
  {
    if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
    {
       // We can hash the data directly from the input buffer.
      SIMD_Compress(state, data, 0);
      databitlen -= bs;
      data += bs/8;
      IncreaseCounter(state, bs);
    }
    else
    {
       // Copy a chunk of data to the buffer
      unsigned int len = bs - current;
      if ( databitlen < len )
      {
        memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
        IncreaseCounter( state, databitlen );
        break;
      }
      else
      {
        memcpy( state->buffer+current/8, data, len/8 );
        IncreaseCounter( state,len );
        databitlen -= len;
        data += len/8;
        current = 0;
        SIMD_Compress( state, state->buffer, 0 );
      }
    }
  }
  current = state->count & (state->blocksize - 1);
  // If there is still some data in the buffer, hash it
  if ( current )
  {
    // We first need to zero out the end of the buffer.
    if ( current & 7 )
    {
      BitSequence mask = 0xff >> ( current & 7 );
      state->buffer[current/8] &= ~mask;
    }
    current = ( current+7 ) / 8;
    memset( state->buffer+current, 0, state->blocksize/8 - current );
    SIMD_Compress( state, state->buffer, 0 );
  }
  //* Input the message length as the last block
  memset( state->buffer, 0, state->blocksize / 8 );
  l = state->count;
  for ( i=0; i<8; i++ )
  {
    state->buffer[i] = l & 0xff;
    l >>= 8;
  }
  if ( state->count < 16384 )
    isshort = 2;
  SIMD_Compress( state, state->buffer, isshort );
  // Decode the 32-bit words into a BitSequence
  for ( i=0; i < 2*state->n_feistels; i++ )
  {
    u32 x = state->A[i];
    out[4*i  ] = x & 0xff;
    x >>= 8;
    out[4*i+1] = x & 0xff;
    x >>= 8;
    out[4*i+2] = x & 0xff;
    x >>= 8;
    out[4*i+3] = x & 0xff;
  }
  memcpy( hashval, out, state->hashbitlen / 8 );
  if ( state->hashbitlen % 8 )
  {
    BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
    hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
  }
  return 0;
 }
 int simd_full( hashState_sd *state, BitSequence *hashval,
                            const BitSequence *data, DataLength databitlen )
 {
  InitIV( state, 512, IV_512 );
  int current, i;
  unsigned int bs = state->blocksize;
  static int align = -1;
  BitSequence out[64];
  int isshort = 1;
  uint64_t l;
  if (align == -1)
    align = RequiredAlignment();
 #ifdef HAS_64
  current = state->count & (bs - 1);
 #else
  current = state->count_low & (bs - 1);
 #endif
  if ( current & 7 )
  {
    // The number of hashed bits is not a multiple of 8.
    // Very painfull to implement and not required by the NIST API.
    return 1;
  }
  while ( databitlen > 0 )
  {
    if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
    {
       // We can hash the data directly from the input buffer.
      SIMD_Compress(state, data, 0);
      databitlen -= bs;
      data += bs/8;
      IncreaseCounter(state, bs);
    }
    else
    {
       // Copy a chunk of data to the buffer
      unsigned int len = bs - current;
      if ( databitlen < len )
      {
        memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
        IncreaseCounter( state, databitlen );
        break;
      }
      else
      {
        memcpy( state->buffer+current/8, data, len/8 );
        IncreaseCounter( state,len );
        databitlen -= len;
        data += len/8;
        current = 0;
        SIMD_Compress( state, state->buffer, 0 );
      }
    }
  }
  current = state->count & (state->blocksize - 1);
  // If there is still some data in the buffer, hash it
  if ( current )
  {
    // We first need to zero out the end of the buffer.
    if ( current & 7 )
    {
      BitSequence mask = 0xff >> ( current & 7 );
      state->buffer[current/8] &= ~mask;
    }
    current = ( current+7 ) / 8;
    memset( state->buffer+current, 0, state->blocksize/8 - current );
    SIMD_Compress( state, state->buffer, 0 );
  }
  //* Input the message length as the last block
  memset( state->buffer, 0, state->blocksize / 8 );
  l = state->count;
  for ( i=0; i<8; i++ )
  {
    state->buffer[i] = l & 0xff;
    l >>= 8;
  }
  if ( state->count < 16384 )
    isshort = 2;
  SIMD_Compress( state, state->buffer, isshort );
  // Decode the 32-bit words into a BitSequence
  for ( i=0; i < 2*state->n_feistels; i++ )
  {
    u32 x = state->A[i];
    out[4*i  ] = x & 0xff;
    x >>= 8;
    out[4*i+1] = x & 0xff;
    x >>= 8;
    out[4*i+2] = x & 0xff;
    x >>= 8;
    out[4*i+3] = x & 0xff;
  }
  memcpy( hashval, out, state->hashbitlen / 8 );
  if ( state->hashbitlen % 8 )
  {
    BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
    hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
  }
  return 0;
 }
 #endif
--- a/algo/simd/nist.h
+++ b/algo/simd/nist.h
@@ -1,64 +0,0 @@
 #ifndef __NIST_H__
 #define __NIST_H__
 /*define data alignment for different C compilers*/
 #if defined(__GNUC__)
 #define DATA_ALIGN(x) x __attribute__((aligned(16)))
 #else
 #define DATA_ALIGN(x) __declspec(align(16)) x
 #endif
 #include "simd-compat.h"
 #include "compat/sha3-defs.h"
 /*
 * NIST API Specific types.
 */
 typedef struct {
  unsigned int hashbitlen;
  unsigned int blocksize;
  unsigned int n_feistels;
 #ifdef HAS_64
  uint64_t count;
 #else
  uint32_t count_low;
  uint32_t count_high;
 #endif
  DATA_ALIGN(uint32_t A[32]);
  uint32_t *B;
  uint32_t *C;
  uint32_t *D;
  DATA_ALIGN(unsigned char buffer[128]);
 } hashState_sd;
 /* 
 * NIST API
 */
 int init_sd(hashState_sd *state, int hashbitlen);
 int update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen);
 int final_sd(hashState_sd *state, BitSequence *hashval);
 int update_final_sd( hashState_sd *state, BitSequence *hashval,
                            const BitSequence *data, DataLength databitlen );
 int simd_full( hashState_sd *state, BitSequence *hashval,
               const BitSequence *data, DataLength databitlen );
 /* 
 * Internal API
 */
 //int SupportedLength(int hashbitlen);
 int RequiredAlignment(void);
 void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final);
 void fft128_natural(fft_t *a, unsigned char *x);
 void fft256_natural(fft_t *a, unsigned char *x);
 #endif
--- a/algo/simd/simd-compat.h
+++ b/algo/simd/simd-compat.h
@@ -1,198 +0,0 @@
 #ifndef __SIMD_COMPAT_H__
 #define __SIMD_COMPAT_H__
 #include <limits.h>
 /* 
 * This file desfines some helper function for cross-platform compatibility.
 */
 #if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__)
 #define GNU_EXT
 #endif
 /*
 * First define some integer types.
 */
 #if defined __STDC__ && __STDC_VERSION__ >= 199901L
 /*
 * On C99 implementations, we can use <stdint.h> to get an exact 32-bit
 * type, if any, or otherwise use a wider type.
 */
 #include <stdint.h>
 #include "compat/brg_types.h"
 #define C32(x)    ((u32)(x))
 #define HAS_64  1
 #else
 /*
 * On non-C99 systems, we use "unsigned int" if it is wide enough,
 * "unsigned long" otherwise. This supports all "reasonable" architectures.
 * We have to be cautious: pre-C99 preprocessors handle constants
 * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
 */
 #if ((UINT_MAX >> 11) >> 11) >= 0x3FF
 typedef unsigned int u32;
 #define C32(x)    ((u32)(x ## U))
 #else
 typedef unsigned long u32;
 #define C32(x)    ((u32)(x ## UL))
 #endif
 /*
 * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
 * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
 * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
 * test whether "unsigned long long" is available; we also know that
 * gcc features this type, even if the libc header do not know it.
 */
 #if ((ULONG_MAX >> 31) >> 31) >= 3
 typedef unsigned long u64;
 #define HAS_64  1
 #elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
 typedef unsigned long long u64;
 #define HAS_64  1
 #else
 /*
 * No 64-bit type...
 */
 #endif
 #endif
 /*
 * fft_t should be at least 16 bits wide.
 * using short int will require less memory, but int is faster...
 */
 typedef int fft_t;
 /*
 * Implementation note: some processors have specific opcodes to perform
 * a rotation. Recent versions of gcc recognize the expression above and
 * use the relevant opcodes, when appropriate.
 */
 #define T32(x)    ((x) & C32(0xFFFFFFFF))
 #define ROTL32(x, n)   T32(((x) << (n)) | ((x) >> (32 - (n))))
 #define ROTR32(x, n)   ROTL32(x, (32 - (n)))
 /*
 * The macro MAYBE_INLINE expands to an inline qualifier, is available.
 */
 #if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT
 #define MAYBE_INLINE static inline
 #elif defined _MSC_VER
 #define MAYBE_INLINE __inline
 #else
 #define MAYBE_INLINE
 #endif
 /*  */
 #if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ )
 #define rdtsc()                                                         \
  ({                                                                    \
    u32 lo, hi;                                                         \
    __asm__ __volatile__ (      /* serialize */                         \
                          "xorl %%eax,%%eax \n        cpuid"            \
                          ::: "%rax", "%rbx", "%rcx", "%rdx");          \
    /* We cannot use "=A", since this would use %rax on x86_64 */       \
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));              \
    (u64)hi << 32 | lo;                                                 \
  })                                                                    \
 #elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
 #define rdtsc __rdtsc
 #endif
 /* 
 * The IS_ALIGNED macro tests if a char* pointer is aligned to an
 * n-bit boundary.
 * It is defined as false on unknown architectures.
 */
 #define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0)
 #if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
 /*
 * Unaligned 32-bit access are not expensive on x86 so we don't care
 */
 #define IS_ALIGNED(p,n)    (n<=4 || CHECK_ALIGNED(p,n))
 #elif defined __sparcv9 || defined __sparc || defined __arm || \
      defined __ia64 || defined __ia64__ || \
      defined __itanium__ || defined __M_IA64 || \
      defined __powerpc__ || defined __powerpc
 #define IS_ALIGNED(p,n)    CHECK_ALIGNED(p,n)
 #else
 /* 
 * Unkonwn architecture: play safe
 */
 #define IS_ALIGNED(p,n)    0
 #endif
 /* checks for endianness */
 #if defined (__linux__) || defined (__GLIBC__)
 #  include <endian.h>
 #elif defined (__FreeBSD__)
 #  include <machine/endian.h> 
 #elif defined (__OpenBSD__)
 #  include <sys/endian.h>
 #endif
 #ifdef __BYTE_ORDER
 #  if __BYTE_ORDER == __LITTLE_ENDIAN
 #    define SIMD_LITTLE_ENDIAN
 #  elif __BYTE_ORDER == __BIG_ENDIAN
 #    define SIMD_BIG_ENDIAN
 #  endif
 #else
 #  if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
 #    define SIMD_LITTLE_ENDIAN
 #  endif
 #endif
 #endif
--- a/algo/simd/simd-hash-2way.h
+++ b/algo/simd/simd-hash-2way.h
@@ -1,7 +1,6 @@
 #ifndef SIMD_HASH_2WAY_H__
 #define SIMD_HASH_2WAY_H__ 1
 #include "simd-compat.h"
 #include "simd-utils.h"
 #if defined(__SSE2__) || defined (__ARM_NEON)
@@ -34,7 +33,7 @@ typedef struct
  unsigned int hashbitlen;
  unsigned int blocksize;
  unsigned int n_feistels;
-} simd512_2way_context __attribute__((aligned(128)));
+} simd512_2way_context __attribute__((aligned(64)));
 #define simd_2way_context simd512_2way_context
 // databitlen is bits
--- a/algo/simd/vector.c
+++ b/algo/simd/vector.c
@@ -1,948 +0,0 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include "nist.h"
 #include "vector.h"
 //#if defined(__SSE2__) || defined(__ARM_NEON)
 #if defined(__SSE2__)
 #define PRINT_SOME 0
 /*
 int SupportedLength(int hashbitlen) {
  if (hashbitlen <= 0 || hashbitlen > 512)
    return 0;
  else
    return 1;
 }
 */
 int RequiredAlignment(void) {
  return 16;
 }
 static const union cv V128 = CV(128);
 static const union cv V255 = CV(255);
 static const union cv V257 = CV(257);
 static const union cv8  V0 = CV(0);
 /*
 * Reduce modulo 257; result is in [-127; 383]
 * REDUCE(x) := (x&255) - (x>>8)
 */
 #define REDUCE(x)                               \
  v16_sub(v16_and(x, V255.v16), v16_shift_r (x, 8))
 /*
 * Reduce from [-127; 383] to [-128; 128]
 * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
 */
 #define EXTRA_REDUCE_S(x)                       \
  v16_sub(x, v16_and(V257.v16, v16_cmp(x, V128.v16)))
 /*
 * Reduce modulo 257; result is in [-128; 128]
 */
 #define REDUCE_FULL_S(x)                        \
  EXTRA_REDUCE_S(REDUCE(x))
 #define DO_REDUCE(i)                            \
  X(i) = REDUCE(X(i))
 #define DO_REDUCE_FULL_S(i)                     \
  do {                                          \
    X(i) = REDUCE(X(i));                        \
    X(i) = EXTRA_REDUCE_S(X(i));                \
  } while(0)
 #define MAYBE_VOLATILE
 MAYBE_INLINE void fft64(void *a) {
  v16* const A = a;
  register v16 X0, X1, X2, X3, X4, X5, X6, X7;
 /*
 #if V16_SIZE == 8
 #define X(i) A[i]
 #elif V16_SIZE == 4
 #define X(i) A[2*i]
 #endif
 */
 #define X(i) X##i
  X0 = A[0];
  X1 = A[1];
  X2 = A[2];
  X3 = A[3];
  X4 = A[4];
  X5 = A[5];
  X6 = A[6];
  X7 = A[7];
 #define DO_REDUCE(i)                            \
  X(i) = REDUCE(X(i))
  /*
   * Begin with 8 parallels DIF FFT_8
   *
   * FFT_8 using w=4 as 8th root of unity
   *  Unrolled decimation in frequency (DIF) radix-2 NTT.
   *  Output data is in revbin_permuted order.
   */
  static const int w[] = {0, 2, 4, 6};
  //  v16 *Twiddle = (v16*)FFT64_Twiddle;
 #define BUTTERFLY(i,j,n)                                \
  do {                                                  \
    MAYBE_VOLATILE v16 v = X(j);                              \
    X(j) =  v16_add(X(i), X(j));                        \
    if (n)                                              \
      X(i) = v16_shift_l(v16_sub(X(i), v), w[n]);       \
    else                                                \
      X(i) = v16_sub(X(i), v);                          \
  } while(0)
  BUTTERFLY(0, 4, 0);
  BUTTERFLY(1, 5, 1);
  BUTTERFLY(2, 6, 2);
  BUTTERFLY(3, 7, 3);
  DO_REDUCE(2);
  DO_REDUCE(3);
  BUTTERFLY(0, 2, 0);
  BUTTERFLY(4, 6, 0);
  BUTTERFLY(1, 3, 2);
  BUTTERFLY(5, 7, 2);
  DO_REDUCE(1);
  BUTTERFLY(0, 1, 0);
  BUTTERFLY(2, 3, 0);
  BUTTERFLY(4, 5, 0);
  BUTTERFLY(6, 7, 0);
  /* We don't need to reduce X(7) */
  DO_REDUCE_FULL_S(0);
  DO_REDUCE_FULL_S(1);
  DO_REDUCE_FULL_S(2);
  DO_REDUCE_FULL_S(3);
  DO_REDUCE_FULL_S(4);
  DO_REDUCE_FULL_S(5);
  DO_REDUCE_FULL_S(6);
 #undef BUTTERFLY
  /*
   * Multiply by twiddle factors
   */
  X(6) = v16_mul(X(6), FFT64_Twiddle[0].v16);
  X(5) = v16_mul(X(5), FFT64_Twiddle[1].v16);
  X(4) = v16_mul(X(4), FFT64_Twiddle[2].v16);
  X(3) = v16_mul(X(3), FFT64_Twiddle[3].v16);
  X(2) = v16_mul(X(2), FFT64_Twiddle[4].v16);
  X(1) = v16_mul(X(1), FFT64_Twiddle[5].v16);
  X(0) = v16_mul(X(0), FFT64_Twiddle[6].v16);
  /*
   * Transpose the FFT state with a revbin order permutation
   * on the rows and the column.
   * This will make the full FFT_64 in order.
   */
 #define INTERLEAVE(i,j)                          \
  do {                                           \
    v16 t1= X(i);                                \
    v16 t2= X(j);                                \
    X(i) = v16_interleavel(t1, t2);              \
    X(j) = v16_interleaveh(t1, t2);              \
  } while(0)
  INTERLEAVE(1, 0);
  INTERLEAVE(3, 2);
  INTERLEAVE(5, 4);
  INTERLEAVE(7, 6);
  INTERLEAVE(2, 0);
  INTERLEAVE(3, 1);
  INTERLEAVE(6, 4);
  INTERLEAVE(7, 5);
  INTERLEAVE(4, 0);
  INTERLEAVE(5, 1);
  INTERLEAVE(6, 2);
  INTERLEAVE(7, 3);
 #undef INTERLEAVE
  /*
   * Finish with 8 parallels DIT FFT_8
   *
   * FFT_8 using w=4 as 8th root of unity
   *  Unrolled decimation in time (DIT) radix-2 NTT.
   *  Intput data is in revbin_permuted order.
   */
 #define BUTTERFLY(i,j,n)                                \
  do {                                                  \
    MAYBE_VOLATILE v16 u = X(j);                              \
    if (n)                                              \
      X(i) = v16_shift_l(X(i), w[n]);                   \
    X(j) = v16_sub(X(j), X(i));                         \
    X(i) = v16_add(u, X(i));                            \
  } while(0)
  DO_REDUCE(0);
  DO_REDUCE(1);
  DO_REDUCE(2);
  DO_REDUCE(3);
  DO_REDUCE(4);
  DO_REDUCE(5);
  DO_REDUCE(6);
  DO_REDUCE(7);
  BUTTERFLY(0, 1, 0);
  BUTTERFLY(2, 3, 0);
  BUTTERFLY(4, 5, 0);
  BUTTERFLY(6, 7, 0);
  BUTTERFLY(0, 2, 0);
  BUTTERFLY(4, 6, 0);
  BUTTERFLY(1, 3, 2);
  BUTTERFLY(5, 7, 2);
  DO_REDUCE(3);
  BUTTERFLY(0, 4, 0);
  BUTTERFLY(1, 5, 1);
  BUTTERFLY(2, 6, 2);
  BUTTERFLY(3, 7, 3);
  DO_REDUCE_FULL_S(0);
  DO_REDUCE_FULL_S(1);
  DO_REDUCE_FULL_S(2);
  DO_REDUCE_FULL_S(3);
  DO_REDUCE_FULL_S(4);
  DO_REDUCE_FULL_S(5);
  DO_REDUCE_FULL_S(6);
  DO_REDUCE_FULL_S(7);
 #undef BUTTERFLY
  A[0] = X0;
  A[1] = X1;
  A[2] = X2;
  A[3] = X3;
  A[4] = X4;
  A[5] = X5;
  A[6] = X6;
  A[7] = X7;
 #undef X
 }
 MAYBE_INLINE void fft128(void *a) {
  int i;
  // Temp space to help for interleaving in the end
  v16 B[8];
  v16 *A = (v16*) a;
  //  v16 *Twiddle = (v16*)FFT128_Twiddle;
  /* Size-2 butterflies */
  for (i = 0; i<8; i++) {
    B[i]   = v16_add(A[i], A[i+8]);
    B[i]   = REDUCE_FULL_S(B[i]);
    A[i+8] = v16_sub(A[i], A[i+8]);
    A[i+8] = REDUCE_FULL_S(A[i+8]);
    A[i+8] = v16_mul(A[i+8], FFT128_Twiddle[i].v16);
    A[i+8] = REDUCE_FULL_S(A[i+8]);
  }
  fft64(B);
  fft64(A+8);
  /* Transpose (i.e. interleave) */
  for (i=0; i<8; i++) {
    A[2*i]   = v16_interleavel (B[i], A[i+8]);
    A[2*i+1] = v16_interleaveh (B[i], A[i+8]);
  }
 }
 #ifdef v16_broadcast
 /* Compute the FFT using a table
 * The function works if the value of the message is smaller 
 * than 2^14.
 */
 void fft128_msg_final(short *a, const unsigned char *x) {
  static const union cv FFT128_Final_Table[] = {
    {{   1, -211,   60,  -67,    2,   92, -137,  123}},
    {{   2,  118,   45,  111,   97,  -46,   49, -106}},
    {{   4,  -73,  -17,  -11,    8,  111,  -34,  -22}},
    {{ -68,   -4,   76,  -25,   96,  -96,  -68,   -9}},
    {{  16,  -35,  -68,  -44,   32,  -70, -136,  -88}},
    {{   0, -124,   17,   12,   -6,   57,   47,   -8}},
    {{  64,  117,  -15,   81,  128,  -23,  -30,  -95}},
    {{ -68,  -53,  -52,  -70,  -10, -117,   77,   21}},
    {{  -1,  -46,  -60,   67,   -2,  -92, -120, -123}},
    {{  -2, -118,  -45, -111,  -97,   46,  -49,  106}},
    {{  -4,   73,   17,   11,   -8, -111,   34,   22}},
    {{  68,    4,  -76,   25,  -96,   96,   68,    9}},
    {{ -16, -222,   68,   44,  -32,   70, -121,   88}},
    {{   0,  124,  -17,  -12,    6,  -57,  -47,    8}},
    {{ -64, -117,   15,  -81, -128, -234,   30,   95}},
    {{  68,   53,   52,   70,   10,  117,  -77,  -21}},
    {{-118,  -31,  116,  -61,   21,  -62,  -25, -122}},
    {{-101,  107,  -45,  -95,   -8,    3,  101,  -34}},
    {{  42, -124,  -50,   13,   84,    9, -100, -231}},
    {{ -79,  -53,   82,   65,  -81,   47,   61,  107}},
    {{ -89, -239,   57, -205, -178,   36, -143,  104}},
    {{-126,  113,   33,  111,  103, -109,   65, -114}},
    {{ -99,   72,  -29,  -49, -198, -113,  -58,  -98}},
    {{   8,  -27, -106,  -30,  111,    6,   10, -108}},
    {{-139,   31, -116, -196,  -21,   62,   25, -135}},
    {{ 101, -107,   45,   95,    8,   -3, -101,   34}},
    {{ -42, -133,   50,  -13,  -84,   -9,  100,  -26}},
    {{  79,   53,  -82,  -65,   81,  -47,  -61, -107}},
    {{-168,  -18,  -57,  -52,  -79,  -36, -114, -104}},
    {{ 126, -113,  -33, -111, -103,  109,  -65,  114}},
    {{  99,  -72, -228,   49,  -59,  113,   58, -159}},
    {{  -8,   27,  106,   30, -111,   -6,  -10,  108}}
  };
  //  v16 *Table = (v16*)FFT128_Final_Table;
  v16 *A = (v16*) a;
  v16 msg1 = v16_broadcast(x[0]>128?x[0]-257:x[0]);
  v16 msg2 = v16_broadcast(x[1]>128?x[1]-257:x[1]);
  // v16 msg2 = v16_broadcast(x[1]);
 #if 0
  int i;
  for (i=0; i<16; i++) {
    v16 tmp = v16_mul(FFT128_Final_Table[2*i].v16  , msg2);
    v16 sum = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);
    sum = v16_add(sum, tmp);
    A[i] = REDUCE_FULL_S(sum);
  }
 #else
 #define FFT_FINAL(i)                                           \
  v16 tmp##i = v16_mul(FFT128_Final_Table[2*i].v16, msg2);     \
  v16 sum##i = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);   \
  sum##i = v16_add(sum##i, tmp##i);                            \
  A[i] = REDUCE_FULL_S(sum##i);
  FFT_FINAL(0)
  FFT_FINAL(1)
  FFT_FINAL(2)
  FFT_FINAL(3)
  FFT_FINAL(4)
  FFT_FINAL(5)
  FFT_FINAL(6)
  FFT_FINAL(7)
  FFT_FINAL(8)
  FFT_FINAL(9)
  FFT_FINAL(10)
  FFT_FINAL(11)
  FFT_FINAL(12)
  FFT_FINAL(13)
  FFT_FINAL(14)
  FFT_FINAL(15)
 #endif
 }
 #endif
 void fft128_msg(short *a, const unsigned char *x, int final) {
  static const union cv Tweak =
    {{0,0,0,0,0,0,0,1}};
  static const union cv FinalTweak =
    {{0,0,0,0,0,1,0,1}};
  v8  *X = (v8*)  x;
  v16 *A = (v16*) a;
  //  v16 *Twiddle = (v16*)FFT128_Twiddle;
 #define UNPACK(i)                                      \
  do {                                                 \
    v8 t = X[i];                                       \
    A[2*i]   = v8_mergel(t, V0.v8);                    \
    A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16);          \
    A[2*i+8] = REDUCE(A[2*i+8]);                       \
    A[2*i+1] = v8_mergeh(t, V0.v8);                    \
    A[2*i+9] = v16_mul(A[2*i+1], FFT128_Twiddle[2*i+1].v16);      \
    A[2*i+9] = REDUCE(A[2*i+9]);                       \
  } while(0)
  /* 
   * This allows to tweak the last butterflies to introduce X^127
   */
 #define UNPACK_TWEAK(i,tw)                             \
  do {                                                 \
    v8 t = X[i];                                       \
    v16 tmp;                                           \
    A[2*i]   = v8_mergel(t, V0.v8);                    \
    A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16);          \
    A[2*i+8] = REDUCE(A[2*i+8]);                       \
    tmp      = v8_mergeh(t, V0.v8);                    \
    A[2*i+1] = v16_add(tmp, tw);                               \
    A[2*i+9] = v16_mul(v16_sub(tmp, tw), FFT128_Twiddle[2*i+1].v16);      \
    A[2*i+9] = REDUCE(A[2*i+9]);                       \
  } while(0)
  UNPACK(0);
  UNPACK(1);
  UNPACK(2);
  if (final)
    UNPACK_TWEAK(3, FinalTweak.v16);
  else
    UNPACK_TWEAK(3, Tweak.v16);
 #undef UNPACK
 #undef UNPACK_TWEAK
  fft64(a);
  fft64(a+64);
 }
 #if 0
 void fft128_msg(short *a, const unsigned char *x, int final) {
  for (int i=0; i<64; i++)
    a[i] = x[i];
  for (int i=64; i<128; i++)
    a[i] = 0;
  a[127] = 1;
  a[125] = final? 1: 0;
  fft128(a);
 }
 #endif
 void fft256_msg(short *a, const unsigned char *x, int final) {
  static const union cv Tweak =
    {{0,0,0,0,0,0,0,1}};
  static const union cv FinalTweak =
    {{0,0,0,0,0,1,0,1}};
  v8  *X = (v8*)  x;
  v16 *A = (v16*) a;
  //  v16 *Twiddle = (v16*)FFT256_Twiddle;
 #define UNPACK(i)                                       \
  do {                                                  \
    v8 t      = X[i];                                   \
    A[2*i]    = v8_mergel(t, V0.v8);                    \
    A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16);          \
    A[2*i+16] = REDUCE(A[2*i+16]);                      \
    A[2*i+1]  = v8_mergeh(t, V0.v8);                    \
    A[2*i+17] = v16_mul(A[2*i+1], FFT256_Twiddle[2*i+1].v16);      \
    A[2*i+17] = REDUCE(A[2*i+17]);                       \
  } while(0)
  /* 
   * This allows to tweak the last butterflies to introduce X^127
   */
 #define UNPACK_TWEAK(i,tw)                              \
  do {                                                  \
    v8 t = X[i];                                        \
    v16 tmp;                                            \
    A[2*i]    = v8_mergel(t, V0.v8);                    \
    A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16);          \
    A[2*i+16] = REDUCE(A[2*i+16]);                       \
    tmp       = v8_mergeh(t, V0.v8);                    \
    A[2*i+1]  = v16_add(tmp, tw);                               \
    A[2*i+17] = v16_mul(v16_sub(tmp, tw), FFT256_Twiddle[2*i+1].v16);      \
    A[2*i+17] = REDUCE(A[2*i+17]);                      \
  } while(0)
  UNPACK(0);
  UNPACK(1);
  UNPACK(2);
  UNPACK(3);
  UNPACK(4);
  UNPACK(5);
  UNPACK(6);
  if (final)
    UNPACK_TWEAK(7, FinalTweak.v16);
  else
    UNPACK_TWEAK(7, Tweak.v16);
 #undef UNPACK
 #undef UNPACK_TWEAK
  fft128(a);
  fft128(a+128);
 }
 void rounds(u32* state, const unsigned char* msg, short* fft) {
  v32* S = (v32*) state;
  const v32* M = (v32*)msg;
  volatile v16* W = (v16*)fft;
  register v32 S0, S1, S2, S3;
  static const union cv code[] = { CV(185), CV(233) };
  S0 = v32_xor(S[0], v32_bswap(M[0]));
  S1 = v32_xor(S[1], v32_bswap(M[1]));
  S2 = v32_xor(S[2], v32_bswap(M[2]));
  S3 = v32_xor(S[3], v32_bswap(M[3]));
 #define S(i) S##i
 /* #define F_0(B, C, D)     ((((C) ^ (D)) & (B)) ^ (D)) */
 /* #define F_1(B, C, D)     (((D) & (C)) | (((D) | (C)) & (B))) */
 #define F_0(B, C, D)     v32_xor(v32_and(v32_xor(C,D), B), D)
 #define F_1(B, C, D)     v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
 #define F(a,b,c,fun) F_##fun (a,b,c)
  /*
   * We split the round function in two halfes
   * so as to insert some independent computations in between
   */
 #define SUM3_00 1
 #define SUM3_01 2
 #define SUM3_02 3
 #define SUM3_10 2
 #define SUM3_11 3
 #define SUM3_12 1
 #define SUM3_20 3
 #define SUM3_21 1
 #define SUM3_22 2
 #define STEP_1(a,b,c,d,w,fun,r,s,z)                             \
  do {                                                          \
    if (PRINT_SOME) {                                           \
      int j;                                                    \
      v32 ww=w, aa=a, bb=b, cc=c, dd=d;                         \
      u32 *WW = (void*)&ww;                                     \
      u32 *AA = (void*)&aa;                                     \
      u32 *BB = (void*)&bb;                                     \
      u32 *CC = (void*)&cc;                                     \
      u32 *DD = (void*)&dd;                                     \
      for (j=0; j<4; j++) {                                     \
        printf ("%08x/%2i/%2i[%i]: %08x %08x %08x %08x\n",      \
                WW[j], r, s, SUM3_##z,                          \
                AA[j], BB[j], CC[j], DD[j]);                    \
      }                                                         \
    }                                                           \
    TT = F(a,b,c,fun);                                          \
    a = v32_rotate(a,r);                                        \
    w = v32_add(w, d);                                          \
    TT = v32_add(TT, w);                                        \
    TT = v32_rotate(TT,s);                                      \
    d = v32_shufxor(a,SUM3_##z);                                \
  } while(0)
 #define STEP_2(a,b,c,d,w,fun,r,s)                               \
  do {                                                          \
    d = v32_add(d, TT);                                         \
  } while(0)
 #define STEP(a,b,c,d,w,fun,r,s,z)               \
  do {                                          \
    register v32 TT;                            \
    STEP_1(a,b,c,d,w,fun,r,s,z);                \
    STEP_2(a,b,c,d,w,fun,r,s);                  \
  } while(0);
 #define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,        \
              fun,r,s,t,u,z,r0)                           \
  do {                                                    \
    register v32 W0, W1, W2, W3, TT;                      \
    W0 = v16_merge##u0(W[h0], W[l0]);                     \
    W0 = V1632(v16_mul(V3216(W0), code[z].v16));          \
    STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, r0##0); \
    W1 = v16_merge##u1(W[h1], W[l1]);                     \
    W1 = V1632(v16_mul(V3216(W1), code[z].v16));          \
    STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s);        \
    STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, r0##1); \
    W2 = v16_merge##u2(W[h2], W[l2]);                     \
    W2 = V1632(v16_mul(V3216(W2), code[z].v16));          \
    STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t);        \
    STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, r0##2); \
    W3 = v16_merge##u3(W[h3], W[l3]);                     \
    W3 = V1632(v16_mul(V3216(W3), code[z].v16));          \
    STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u);        \
    STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, r0##0); \
    STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r);        \
  } while(0)
  /*
   * 4 rounds with code 185
   */
  ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0, 3,  23, 17, 27, 0, 0);
  ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1, 3,  23, 17, 27, 0, 1);
  ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22,  7, 0, 2);
  ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22,  7, 0, 0);
  /*
   * 4 rounds with code 233
   */
  ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1, 1);
  ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1, 2);
  ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1, 0);
  ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1, 1);
  /*
   * 1 round as feed-forward
   */
  STEP(S(0), S(1), S(2), S(3), S[0], 0,  4, 13, 20);
  STEP(S(3), S(0), S(1), S(2), S[1], 0, 13, 10, 21);
  STEP(S(2), S(3), S(0), S(1), S[2], 0, 10, 25, 22);
  STEP(S(1), S(2), S(3), S(0), S[3], 0, 25,  4, 20);
  S[0] = S(0);  S[1] = S(1);  S[2] = S(2);  S[3] = S(3);
 #undef ROUND
 #undef STEP
 #undef STEP_1
 #undef STEP_2
 }
 void rounds512(u32* state, const unsigned char* msg, short* fft) {
  v32* S = (v32*) state;
  v32* M = (v32*) msg;
  v16* W = (v16*) fft;
  register v32 S0l, S1l, S2l, S3l;
  register v32 S0h, S1h, S2h, S3h;
  static const union cv code[] = { CV(185), CV(233) };
  S0l = v32_xor(S[0], v32_bswap(M[0]));
  S0h = v32_xor(S[1], v32_bswap(M[1]));
  S1l = v32_xor(S[2], v32_bswap(M[2]));
  S1h = v32_xor(S[3], v32_bswap(M[3]));
  S2l = v32_xor(S[4], v32_bswap(M[4]));
  S2h = v32_xor(S[5], v32_bswap(M[5]));
  S3l = v32_xor(S[6], v32_bswap(M[6]));
  S3h = v32_xor(S[7], v32_bswap(M[7]));
 #define S(i) S##i
 /* #define F_0(B, C, D)     ((((C) ^ (D)) & (B)) ^ (D)) */
 /* #define F_1(B, C, D)     (((D) & (C)) | (((D) | (C)) & (B))) */
 #define F_0(B, C, D)     v32_xor(v32_and(v32_xor(C,D), B), D)
 #define F_1(B, C, D)     v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
 #define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
 #define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
  /*
   * We split the round function in two halfes
   * so as to insert some independent computations in between
   */
 #define SUM7_00 0
 #define SUM7_01 1
 #define SUM7_02 2
 #define SUM7_03 3
 #define SUM7_04 4
 #define SUM7_05 5
 #define SUM7_06 6
 #define SUM7_10 1
 #define SUM7_11 2
 #define SUM7_12 3
 #define SUM7_13 4
 #define SUM7_14 5
 #define SUM7_15 6
 #define SUM7_16 0
 #define SUM7_20 2
 #define SUM7_21 3
 #define SUM7_22 4
 #define SUM7_23 5
 #define SUM7_24 6
 #define SUM7_25 0
 #define SUM7_26 1
 #define SUM7_30 3
 #define SUM7_31 4
 #define SUM7_32 5
 #define SUM7_33 6
 #define SUM7_34 0
 #define SUM7_35 1
 #define SUM7_36 2
 #define SUM7_40 4
 #define SUM7_41 5
 #define SUM7_42 6
 #define SUM7_43 0
 #define SUM7_44 1
 #define SUM7_45 2
 #define SUM7_46 3
 #define SUM7_50 5
 #define SUM7_51 6
 #define SUM7_52 0
 #define SUM7_53 1
 #define SUM7_54 2
 #define SUM7_55 3
 #define SUM7_56 4
 #define SUM7_60 6
 #define SUM7_61 0
 #define SUM7_62 1
 #define SUM7_63 2
 #define SUM7_64 3
 #define SUM7_65 4
 #define SUM7_66 5
 #define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
 #define PERM_0(d,a) /* XOR 1 */           \
  do {                                    \
    d##l = v32_shufxor(a##l,1);           \
    d##h = v32_shufxor(a##h,1);           \
  } while(0)
 #define PERM_1(d,a) /* XOR 6 */           \
  do {                                    \
    d##l = v32_shufxor(a##h,2);           \
    d##h = v32_shufxor(a##l,2);           \
  } while(0)
 #define PERM_2(d,a) /* XOR 2 */           \
  do {                                    \
    d##l = v32_shufxor(a##l,2);           \
    d##h = v32_shufxor(a##h,2);           \
  } while(0)
 #define PERM_3(d,a) /* XOR 3 */           \
  do {                                    \
    d##l = v32_shufxor(a##l,3);           \
    d##h = v32_shufxor(a##h,3);           \
  } while(0)
 #define PERM_4(d,a) /* XOR 5 */           \
  do {                                    \
    d##l = v32_shufxor(a##h,1);           \
    d##h = v32_shufxor(a##l,1);           \
  } while(0)
 #define PERM_5(d,a) /* XOR 7 */           \
  do {                                    \
    d##l = v32_shufxor(a##h,3);           \
    d##h = v32_shufxor(a##l,3);           \
  } while(0)
 #define PERM_6(d,a) /* XOR 4 */           \
  do {                                    \
    d##l = a##h;                          \
    d##h = a##l;                          \
  } while(0)
 #define STEP_1_(a,b,c,d,w,fun,r,s,z)                            \
  do {                                                          \
    if (PRINT_SOME) {                                           \
      int j;                                                    \
      v32 ww=w##l, aa=a##l, bb=b##l, cc=c##l, dd=d##l;          \
      u32 *WW = (void*)&ww;                                     \
      u32 *AA = (void*)&aa;                                     \
      u32 *BB = (void*)&bb;                                     \
      u32 *CC = (void*)&cc;                                     \
      u32 *DD = (void*)&dd;                                     \
      for (j=0; j<4; j++) {                                     \
        printf ("%08x/%2i/%2i: %08x %08x %08x %08x\n",          \
                WW[j], r, s,                                    \
                AA[j], BB[j], CC[j], DD[j]);                    \
      }                                                         \
    }                                                           \
    TTl = Fl(a,b,c,fun);                                        \
    TTh = Fh(a,b,c,fun);                                        \
    a##l = v32_rotate(a##l,r);                                  \
    a##h = v32_rotate(a##h,r);                                  \
    w##l  = v32_add(w##l, d##l);                                \
    w##h  = v32_add(w##h, d##h);                                \
    TTl = v32_add(TTl, w##l);                                   \
    TTh = v32_add(TTh, w##h);                                   \
    TTl = v32_rotate(TTl,s);                                    \
    TTh = v32_rotate(TTh,s);                                    \
    PERM(z,d,a);                                                \
  } while(0)
 #define STEP_1(a,b,c,d,w,fun,r,s,z)             \
  STEP_1_(a,b,c,d,w,fun,r,s,z)
 #define STEP_2_(a,b,c,d,w,fun,r,s)                               \
  do {                                                          \
    d##l = v32_add(d##l, TTl);                                  \
    d##h = v32_add(d##h, TTh);                                  \
  } while(0)
 #define STEP_2(a,b,c,d,w,fun,r,s)              \
  STEP_2_(a,b,c,d,w,fun,r,s)
 #define STEP(a,b,c,d,w1,w2,fun,r,s,z)           \
  do {                                          \
    register v32 TTl, TTh, Wl=w1, Wh=w2;        \
    STEP_1(a,b,c,d,W,fun,r,s,z);                \
    STEP_2(a,b,c,d,W,fun,r,s);                  \
  } while(0);
 #define MSG_l(x) (2*(x))
 #define MSG_h(x) (2*(x)+1)
 #define MSG(w,hh,ll,u,z)                                \
  do {                                                  \
    int a = MSG_##u(hh);                                \
    int b = MSG_##u(ll);                                \
    w##l = v16_mergel(W[a], W[b]);                      \
    w##l = V1632(v16_mul(V3216(w##l), code[z].v16));    \
    w##h = v16_mergeh(W[a], W[b]);                      \
    w##h = V1632(v16_mul(V3216(w##h), code[z].v16));    \
  } while(0)
 #define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,        \
              fun,r,s,t,u,z)                              \
  do {                                                    \
    register v32 W0l, W1l, W2l, W3l, TTl;                 \
    register v32 W0h, W1h, W2h, W3h, TTh;                 \
    MSG(W0,h0,l0,u0,z);                                   \
    STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, 0);     \
    MSG(W1,h1,l1,u1,z);                                   \
    STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s);        \
    STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, 1);     \
    MSG(W2,h2,l2,u2,z);                                   \
    STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t);        \
    STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, 2);     \
    MSG(W3,h3,l3,u3,z);                                   \
    STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u);        \
    STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, 3);     \
    STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r);        \
  } while(0)
  /*
   * 4 rounds with code 185
   */
 #define PERM_START 0
  ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0, 3,  23, 17, 27, 0);
 #undef PERM_START
 #define PERM_START 4
  ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1, 3,  23, 17, 27, 0);
 #undef PERM_START
 #define PERM_START 1
  ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22, 7,  0);
 #undef PERM_START
 #define PERM_START 5
  ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22, 7,  0);
 #undef PERM_START
  /*
   * 4 rounds with code 233
   */
 #define PERM_START 2
  ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1);
 #undef PERM_START
 #define PERM_START 6
  ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1);
 #undef PERM_START
 #define PERM_START 3
  ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1);
 #undef PERM_START
 #define PERM_START 0
  ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1);
 #undef PERM_START
  /*
   * 1 round as feed-forward
   */
 #define PERM_START 4
  STEP(S(0), S(1), S(2), S(3), S[0], S[1], 0,  4, 13, 0);
  STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
  STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
  STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25,  4, 3);
 #undef PERM_START
  S[0] = S0l;  S[1] = S0h;  S[2] = S1l;  S[3] = S1h;
  S[4] = S2l;  S[5] = S2h;  S[6] = S3l;  S[7] = S3h;
 #undef ROUND
 #undef STEP
 #undef STEP_1
 #undef STEP_2
 }
 void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {
  if (state->hashbitlen <= 256) {
    union cv Y[16];
    short* y = (short*) Y[0].u16;
 #ifdef v16_broadcast
    if (final == 2) {
      fft128_msg_final(y, m);
      rounds(state->A, m, y);
    } else {
      fft128_msg(y, m, final);
      rounds(state->A, m, y);
    }
 #else
    fft128_msg(y, m, final);
    rounds(state->A, m, y);
 #endif
  } else {
    union cv Y[32];
    short* y = (short*) Y[0].u16;
    fft256_msg(y, m, final);
    rounds512(state->A, m, y);
  }
 }
 /* 
 * Give the FFT output in the regular order for consitancy checks
 */
 void fft128_natural(fft_t *x, unsigned char *a) {
  union cv Y[16];
  short* y = (short*) Y[0].u16;
  int i;
  fft128_msg(y, a, 0);
  for(i=0; i<64; i++) {
    x[2*i]   = y[i];
    x[2*i+1] = y[i+64];
  }
 }
 #endif // SSE2
--- a/algo/simd/vector.h
+++ b/algo/simd/vector.h
@@ -1,246 +0,0 @@
 #ifndef __VECTOR_H__
 #define __VECTOR_H__
 #include "compat.h"
 #include "simd-utils.h"
 /******************************* 
 * Using GCC vector extensions * 
 *******************************/
 //typedef unsigned char v16qi __attribute__ ((vector_size (16)));
 typedef char          v16qi __attribute__ ((vector_size (16)));
 typedef short          v8hi __attribute__ ((vector_size (16)));
 typedef int            v4si __attribute__ ((vector_size (16)));
 typedef float          v4sf __attribute__ ((vector_size (16)));
 typedef long long int  v2di __attribute__ ((vector_size (16)));
 typedef short          v4hi __attribute__ ((vector_size (8)));
 typedef unsigned char  v8qi __attribute__ ((vector_size (8)));
 typedef v16qi v8;
 typedef v8hi v16;
 typedef v4si v32;
 #define V16_SIZE 8
 union cv {
  unsigned short u16[8];
  v16 v16;
 };
 union cv8 {
  unsigned char u8[16];
  v8 v8;
 };
 union u32 {
  u32 u[4];
  v32 v;
 };
 #define V3216(x) ((v16) (x))
 #define V1632(x) ((v32) (x))
 #define  V168(x) ( (v8) (x))
 #define  V816(x) ((v16) (x))
 #if 0
 /* These instruction are shorter than the PAND/POR/... that GCC uses */
 #define vec_and(x,y)  ({v16 a = (v16) x; v16 b = (v16) y;  __builtin_ia32_andps ((v4sf) a, (v4sf) b);})
 #define vec_or(x,y)   ({v16 a = (v16) x; v16 b = (v16) y;  __builtin_ia32_orps ((v4sf) a, (v4sf) b);})
 #define vec_xor(x,y)  ({v16 a = (v16) x; v16 b = (v16) y;  __builtin_ia32_xorps ((v4sf) a, (v4sf) b);})
 #define vec_andn(x,y) ({v16 a = (v16) x; v16 b = (v16) y;  __builtin_ia32_andnps ((v4sf) a, (v4sf) b);})
 #define v16_and(x,y)  ((v16) vec_and ((x), (y)))
 #define v16_or(x,y)   ((v16) vec_or  ((x), (y)))
 #define v16_xor(x,y)  ((v16) vec_xor ((x), (y)))
 #define v16_andn(x,y) ((v16) vec_andn((x), (y)))
 #define v32_and(x,y)  ((v32) vec_and ((x), (y)))
 #define v32_or(x,y)   ((v32) vec_or  ((x), (y)))
 #define v32_xor(x,y)  ((v32) vec_xor ((x), (y)))
 #define v32_andn(x,y) ((v32) vec_andn((x), (y)))
 #endif
 #if defined(__SSE2__)
 #define vec_and(x,y) ((x)&(y))
 #define vec_or(x,y)  ((x)|(y))
 #define vec_xor(x,y) ((x)^(y))
 #define v16_and vec_and
 #define v16_or  vec_or
 #define v16_xor vec_xor
 #define v32_and vec_and
 #define v32_or  vec_or
 #define v32_xor vec_xor
 #define vec_andn(x,y) __builtin_ia32_pandn128 ((v2di) x, (v2di) y)
 #define v16_andn(x,y) ((v16) vec_andn(x,y))
 #define v32_andn(x,y) ((v32) vec_andn(x,y))
 #define v32_add(x,y) ((x)+(y))
 #define v16_add(x,y) ((x)+(y))
 #define v16_sub(x,y) ((x)-(y))
 #define v16_mul(x,y) ((x)*(y))
 #define v16_neg(x)   (-(x))
 #define v16_shift_l  __builtin_ia32_psllwi128
 #define v16_shift_r  __builtin_ia32_psrawi128
 #define v16_cmp      __builtin_ia32_pcmpgtw128
 #define v16_interleavel   __builtin_ia32_punpcklwd128
 #define v16_interleaveh   __builtin_ia32_punpckhwd128
 #define v16_mergel(a,b)   V1632(__builtin_ia32_punpcklwd128(a,b))
 #define v16_mergeh(a,b)   V1632(__builtin_ia32_punpckhwd128(a,b))
 #define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
 #define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
 #define v32_shift_l  __builtin_ia32_pslldi128
 #define v32_shift_r  __builtin_ia32_psrldi128
 #define v32_rotate(x,n)                                 \
  v32_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))
 #define v32_shuf __builtin_ia32_pshufd
 #define SHUFXOR_1 0xb1          /* 0b10110001 */
 #define SHUFXOR_2 0x4e          /* 0b01001110 */
 #define SHUFXOR_3 0x1b          /* 0b00011011 */
 #define CAT(x, y) x##y
 #define XCAT(x,y) CAT(x,y)
 #define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
 #define v32_bswap(x) (x)
 #define v16_broadcast(x) ({                     \
      union u32 u;                              \
      u32 xx = x;                               \
      u.u[0] = xx | (xx << 16);                 \
      V3216(v32_shuf(u.v,0)); })
 #define CV(x) {{x, x, x, x, x, x, x, x}}
 #elif defined(__aarch64__) && defined(__ARM_NEON)
 #define vec_and( x, y )    v128_and( x, y )
 #define vec_or(x,y)        v128_or( x, y )
 #define vec_xor(x,y)       v128_xor( x, y )
 #define v16_and v128_and
 #define v16_or  v128_or
 #define v16_xor v128_xor
 #define v32_and v128_and
 #define v32_or  v128_or
 #define v32_xor v128_xor
 #define vec_andn( x,y )   v128_andnot( x, y )
 #define v16_andn          vec_andn 
 #define v32_andn          vec_andn
 #define v32_add( x, y )   v128_add32( x, y )
 #define v16_add( x, y )        v128_add16( x, y )
 #define v16_sub( x, y )        v128_sub16( x, y )
 #define v16_mul( x, y )        v128_mul16( x, y )
 #define v16_neg(x)             v128_negate16( x )
 #define v16_shift_l( x, c )    v128_sl16
 #define v16_shift_r            v128_sr16
 #define v16_cmp                v128_cmpgt16
 #define v16_interleavel        v128_unpacklo16
 #define v16_interleaveh        v128_unpackhi16 
 #define v16_mergel(a,b)   V1632(__builtin_ia32_punpcklwd128(a,b))
 #define v16_mergeh(a,b)   V1632(__builtin_ia32_punpckhwd128(a,b))
 #define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
 #define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
 #define v32_shift_l            v128_sl32
 #define v32_shift_r            v128_sr32
 #define v32_rotate(x,n)        v128_rol32
 #define v32_shuf __builtin_ia32_pshufd
 #define SHUFXOR_1 0xb1          /* 0b10110001 */
 #define SHUFXOR_2 0x4e          /* 0b01001110 */
 #define SHUFXOR_3 0x1b          /* 0b00011011 */
 #define CAT(x, y) x##y
 #define XCAT(x,y) CAT(x,y)
 #define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
 #define v32_bswap(x) (x)
 #define v16_broadcast(x) ({                     \
      union u32 u;                              \
      u32 xx = x;                               \
      u.u[0] = xx | (xx << 16);                 \
      V3216(v32_shuf(u.v,0)); })
 #define CV(x) {{x, x, x, x, x, x, x, x}}
 #else
 #error "I don't know how to vectorize on this architecture."
 #endif
 /* Twiddle tables */
  static const union cv FFT64_Twiddle[] = {
    {{1,    2,    4,    8,   16,   32,   64,  128}},
    {{1,   60,    2,  120,    4,  -17,    8,  -34}},
    {{1,  120,    8,  -68,   64,  -30,   -2,   17}},
    {{1,   46,   60,  -67,    2,   92,  120,  123}},
    {{1,   92,  -17,  -22,   32,  117,  -30,   67}},
    {{1,  -67,  120,  -73,    8,  -22,  -68,  -70}},
    {{1,  123,  -34,  -70,  128,   67,   17,   35}},
  };
  static const union cv FFT128_Twiddle[] =  {
    {{  1, -118,   46,  -31,   60,  116,  -67,  -61}},
    {{  2,   21,   92,  -62,  120,  -25,  123, -122}},
    {{  4,   42,  -73, -124,  -17,  -50,  -11,   13}},
    {{  8,   84,  111,    9,  -34, -100,  -22,   26}},
    {{ 16,  -89,  -35,   18,  -68,   57,  -44,   52}},
    {{ 32,   79,  -70,   36,  121,  114,  -88,  104}},
    {{ 64,  -99,  117,   72,  -15,  -29,   81,  -49}},
    {{128,   59,  -23, -113,  -30,  -58,  -95,  -98}},
  };
  static const union cv FFT256_Twiddle[] =  {
    {{   1,   41, -118,   45,   46,   87,  -31,   14}}, 
    {{  60, -110,  116, -127,  -67,   80,  -61,   69}}, 
    {{   2,   82,   21,   90,   92,  -83,  -62,   28}}, 
    {{ 120,   37,  -25,    3,  123,  -97, -122, -119}}, 
    {{   4,  -93,   42,  -77,  -73,   91, -124,   56}}, 
    {{ -17,   74,  -50,    6,  -11,   63,   13,   19}}, 
    {{   8,   71,   84,  103,  111,  -75,    9,  112}}, 
    {{ -34, -109, -100,   12,  -22,  126,   26,   38}}, 
    {{  16, -115,  -89,  -51,  -35,  107,   18,  -33}}, 
    {{ -68,   39,   57,   24,  -44,   -5,   52,   76}}, 
    {{  32,   27,   79, -102,  -70,  -43,   36,  -66}}, 
    {{ 121,   78,  114,   48,  -88,  -10,  104, -105}}, 
    {{  64,   54,  -99,   53,  117,  -86,   72,  125}}, 
    {{ -15, -101,  -29,   96,   81,  -20,  -49,   47}}, 
    {{ 128,  108,   59,  106,  -23,   85, -113,   -7}}, 
    {{ -30,   55,  -58,  -65,  -95,  -40,  -98,   94}}
  };
 #endif
--- a/algo/x11/c11.c
+++ b/algo/x11/c11.c
@@ -13,11 +13,7 @@
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #include "algo/luffa/luffa_for_sse2.h"
 #if defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
@@ -43,11 +39,7 @@ typedef struct {
   hashState_luffa         luffa;
   cubehashParam           cube;
   sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+   simd512_context         simd;
   sph_simd512_context     simd;
 #else
   hashState_sd            simd;
 #endif
 } c11_ctx_holder;
 c11_ctx_holder c11_ctx __attribute__ ((aligned (64)));
@@ -69,11 +61,6 @@ void init_c11_ctx()
   init_luffa( &c11_ctx.luffa, 512 );
   cubehashInit( &c11_ctx.cube, 512, 16, 32 );
   sph_shavite512_init( &c11_ctx.shavite );
 #if defined(__aarch64__)
   sph_simd512_init( &c11_ctx.simd );
 #else
   init_sd( &c11_ctx.simd, 512 );
 #endif
 }
 void c11_hash( void *output, const void *input )
@@ -105,41 +92,35 @@ void c11_hash( void *output, const void *input )
    sph_skein512( &ctx.skein, (const void*) hash, 64 );
    sph_skein512_close( &ctx.skein, hash );
-     update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
+    update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
-     cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
+    cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
-     sph_shavite512( &ctx.shavite, hash, 64);
+    sph_shavite512( &ctx.shavite, hash, 64);
-     sph_shavite512_close( &ctx.shavite, hash);
+    sph_shavite512_close( &ctx.shavite, hash);
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                                   (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
-     update_final_echo ( &ctx.echo, (BitSequence *)hash,
+    update_final_echo ( &ctx.echo, (BitSequence *)hash,
-                         (const BitSequence *)hash, 512 );
+                        (const BitSequence *)hash, 512 );
 #else
-     sph_echo512( &ctx.echo, hash, 64 );
+    sph_echo512( &ctx.echo, hash, 64 );
-     sph_echo512_close( &ctx.echo, hash );
+    sph_echo512_close( &ctx.echo, hash );
 #endif
-        memcpy(output, hash, 32);
+    memcpy(output, hash, 32);
 }
 int scanhash_c11( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
-        uint32_t hash[8] __attribute__((aligned(64)));
+   uint32_t hash[8] __attribute__((aligned(64)));
-        uint32_t *pdata = work->data;
+   uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
-        const uint32_t Htarg = ptarget[7];
+   const uint32_t Htarg = ptarget[7];
 	uint32_t nonce = first_nonce;
   int thr_id = mythr->id;
 	volatile uint8_t *restart = &(work_restart[thr_id].restart);
--- a/algo/x11/timetravel10.c
+++ b/algo/x11/timetravel10.c
@@ -13,17 +13,13 @@
 #include "algo/skein/sph_skein.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #ifdef __AES__
  #include "algo/groestl/aes_ni/hash-groestl.h"
 #else
  #include "algo/groestl/sph_groestl.h"
 #endif
-  #include "algo/luffa/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -37,11 +33,7 @@ typedef struct {
        hashState_luffa         luffa;
        cubehashParam           cube;
        sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+        simd512_context         simd;
   sph_simd512_context     simd;
 #else
   hashState_sd            simd;
 #endif
 #ifdef __AES__
        hashState_groestl       groestl;
 #else
@@ -62,11 +54,6 @@ void init_tt10_ctx()
        init_luffa( &tt10_ctx.luffa, 512 );
        cubehashInit( &tt10_ctx.cube, 512, 16, 32 );
        sph_shavite512_init( &tt10_ctx.shavite );
 #if defined(__aarch64__)
   sph_simd512_init( &tt10_ctx.simd );
 #else
   init_sd( &tt10_ctx.simd, 512 );
 #endif
 #ifdef __AES__
        init_groestl( &tt10_ctx.groestl, 64 );
 #else
@@ -222,27 +209,7 @@ void timetravel10_hash(void *output, const void *input)
        }
        break;
     case 9:
-        if ( i == 0 )
+        simd512_ctx( &ctx.simd, hashB, hashA, dataLen );
        {
           memcpy( &ctx.simd, &tt10_mid.simd, sizeof tt10_mid.simd );
 #if defined(__aarch64__)
           sph_simd512(&ctx.simd, (const void*) input + midlen, tail );
           sph_simd512_close(&ctx.simd, hash);
 #else
           update_final_sd( &ctx.simd, (BitSequence *)hashB,
                            (const BitSequence *)input + midlen, tail*8 );
 #endif
        }
        else
        {
 #if defined(__aarch64__)
           sph_simd512(&ctx.simd, (const void*) hash, 64);
           sph_simd512_close(&ctx.simd, hash);
 #else
           update_sd( &ctx.simd, (const BitSequence *)hashA, dataLen*8 );
           final_sd( &ctx.simd, (BitSequence *)hashB );
 #endif
        }
        break;
     default:
 	break;
@@ -325,15 +292,6 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
           memcpy( &tt10_mid.shavite, &tt10_ctx.shavite, sizeof(tt10_mid.shavite ) );
           sph_shavite512( &tt10_mid.shavite, endiandata, 64 );
           break;
        case 9:
           memcpy( &tt10_mid.simd, &tt10_ctx.simd, sizeof(tt10_mid.simd ) );
 #if defined(__aarch64__)
           sph_simd512( &tt10_mid.simd, (const void*) endiandata, 64 );
           sph_simd512_close( &tt10_mid.simd, hash);
 #else
           update_sd( &tt10_mid.simd, (const BitSequence *)endiandata, 512 );
 #endif
           break;
        default:
           break;
      }
--- a/algo/x11/x11.c
+++ b/algo/x11/x11.c
@@ -22,12 +22,7 @@
  #include "algo/echo/sph_echo.h"
 #endif
 #include "algo/luffa/luffa_for_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 typedef struct {
   sph_blake512_context blake;
@@ -45,11 +40,7 @@ typedef struct {
   hashState_luffa         luffa;
   cubehashParam           cube;
   sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+   simd512_context         simd;
   sph_simd512_context     simd;
 #else
   hashState_sd            simd;
 #endif
 } x11_ctx_holder;
 x11_ctx_holder x11_ctx;
@@ -71,11 +62,6 @@ void init_x11_ctx()
   init_luffa( &x11_ctx.luffa, 512 );
   cubehashInit( &x11_ctx.cube, 512, 16, 32 );
   sph_shavite512_init( &x11_ctx.shavite );
 #if defined(__aarch64__)
   sph_simd512_init( &x11_ctx.simd );
 #else
   init_sd( &x11_ctx.simd, 512 );
 #endif
 }
 void x11_hash( void *state, const void *input )
@@ -118,13 +104,7 @@ void x11_hash( void *state, const void *input )
    sph_shavite512( &ctx.shavite, hash, 64 );
    sph_shavite512_close( &ctx.shavite, hash );
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                       (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
    update_final_echo ( &ctx.echo, (BitSequence *)hash,
--- a/algo/x11/x11evo.c
+++ b/algo/x11/x11evo.c
@@ -20,11 +20,7 @@
  #include "algo/echo/sph_echo.h"
 #endif
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #include "algo/luffa/luffa_for_sse2.h"
 typedef struct {
@@ -37,11 +33,7 @@ typedef struct {
 #endif
    hashState_luffa         luffa;
    cubehashParam           cube;
-#if defined(__aarch64__)
+   simd512_context         simd;
   sph_simd512_context     simd;
 #else
   hashState_sd            simd;
 #endif
    sph_blake512_context    blake;
    sph_bmw512_context      bmw;
    sph_skein512_context    skein;
@@ -63,11 +55,6 @@ void init_x11evo_ctx()
 #endif
     init_luffa( &x11evo_ctx.luffa, 512 );
     cubehashInit( &x11evo_ctx.cube, 512, 16, 32 );
 #if defined(__aarch64__)
     sph_simd512_init( &x11evo_ctx.simd );
 #else
     init_sd( &x11evo_ctx.simd, 512 );
 #endif
     sph_blake512_init( &x11evo_ctx.blake );
     sph_bmw512_init( &x11evo_ctx.bmw );
     sph_skein512_init( &x11evo_ctx.skein );
@@ -146,12 +133,7 @@ void x11evo_hash( void *state, const void *input )
 	      sph_shavite512_close( &ctx.shavite, (char*)hash );
 	      break;
 	    case 9:
-#if defined(__aarch64__)
+         simd512_ctx( &ctx.simd, hash, hash, 64 );
         sph_simd512(&ctx.simd, (const void*) hash, 64);
         sph_simd512_close(&ctx.simd, hash);
 #else
         update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 );
 #endif
    break;
 	    case 10:
 #ifdef __AES__
--- a/algo/x11/x11gost.c
+++ b/algo/x11/x11gost.c
@@ -17,12 +17,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
 #include "algo/simd/sph_simd.h"
 #else
 #include "algo/simd/nist.h"
 #endif
 #if defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +42,7 @@ typedef struct {
   hashState_luffa         luffa;
   cubehashParam           cube;
   sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+   simd512_context         simd;
  sph_simd512_context     simd;
 #else
  hashState_sd            simd;
 #endif
   sph_gost512_context     gost;
 } x11gost_ctx_holder;
@@ -75,11 +66,6 @@ void init_x11gost_ctx()
   sph_shavite512_init( &x11gost_ctx.shavite );
   init_luffa( &x11gost_ctx.luffa, 512 );
   cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
 #if defined(__aarch64__)
    sph_simd512_init(&x11gost_ctx.simd);
 #else
    init_sd( &x11gost_ctx.simd, 512 );
 #endif
 }
 void x11gost_hash(void *output, const void *input)
@@ -123,13 +109,7 @@ void x11gost_hash(void *output, const void *input)
    sph_shavite512( &ctx.shavite, hash, 64 );
    sph_shavite512_close( &ctx.shavite, hash );
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512 (&ctx.simd, hash, 64); 
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                                   (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
     update_final_echo ( &ctx.echo, (BitSequence *)hash,
--- a/algo/x12/x12.c
+++ b/algo/x12/x12.c
@@ -17,11 +17,7 @@
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
@@ -44,11 +40,7 @@ typedef struct {
   hashState_luffa         luffa;
   cubehashParam            cubehash;
   sph_shavite512_context   shavite;
-#if defined(__aarch64__)
+   simd512_context         simd;
   sph_simd512_context     simd;
 #else
   hashState_sd            simd;
 #endif
   sph_hamsi512_context     hamsi;
 } x12_ctx_holder;
@@ -68,14 +60,9 @@ void init_x12_ctx()
        sph_groestl512_init(&x12_ctx.groestl);
        sph_echo512_init(&x12_ctx.echo);
 #endif
-   init_luffa( &x12_ctx.luffa, 512 );
+        init_luffa( &x12_ctx.luffa, 512 );
        cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
        sph_shavite512_init( &x12_ctx.shavite );
 #if defined(__aarch64__)
   sph_simd512_init( &x12_ctx.simd );
 #else
   init_sd( &x12_ctx.simd, 512 );
 #endif
        sph_hamsi512_init( &x12_ctx.hamsi );
 };
@@ -101,13 +88,7 @@ void x12hash(void *output, const void *input)
   sph_shavite512( &ctx.shavite, hash, 64);
   sph_shavite512_close( &ctx.shavite, hashB);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, hash, hashB, 64 );
    sph_simd512(&ctx.simd, (const void*) hashB, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
    final_sd( &ctx.simd, (BitSequence *)hash );
 #endif
 #if defined(__AES__)
   update_final_echo ( &ctx.echo, (BitSequence *)hashB,
--- a/algo/x13/x13.c
+++ b/algo/x13/x13.c
@@ -15,11 +15,7 @@
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
 #include "algo/simd/sph_simd.h"
 #else
 #include "algo/simd/nist.h"
 #endif
 #if defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -48,11 +44,7 @@ typedef struct {
   hashState_luffa         luffa;
   cubehashParam           cubehash;
   sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+   simd512_context         simd;
  sph_simd512_context     simd;
 #else
  hashState_sd            simd;
 #endif
   sph_hamsi512_context    hamsi;
 } x13_ctx_holder;
@@ -77,11 +69,6 @@ void init_x13_ctx()
   init_luffa( &x13_ctx.luffa, 512 );
   cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
   sph_shavite512_init( &x13_ctx.shavite );
 #if defined(__aarch64__)
    sph_simd512_init(&x13_ctx.simd);
 #else
    init_sd( &x13_ctx.simd, 512 );
 #endif
   sph_hamsi512_init( &x13_ctx.hamsi );
 };
@@ -121,13 +108,7 @@ void x13hash(void *output, const void *input)
    sph_shavite512( &ctx.shavite, hash, 64);
    sph_shavite512_close( &ctx.shavite, hash);
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                                   (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
    update_final_echo ( &ctx.echo, (BitSequence *)hash,
--- a/algo/x13/x13bcd.c
+++ b/algo/x13/x13bcd.c
@@ -15,11 +15,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #if defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +43,7 @@ typedef struct {
   sph_skein512_context    skein;
   cubehashParam           cube;
   sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+   simd512_context         simd;
   sph_simd512_context     simd;
 #else
   hashState_sd            simd;
 #endif
   sph_hamsi512_context    hamsi;
   sm3_ctx_t               sm3;
 } x13bcd_ctx_holder;
@@ -76,11 +68,6 @@ void init_x13bcd_ctx()
   sph_keccak512_init( &x13bcd_ctx.keccak );
   cubehashInit( &x13bcd_ctx.cube,512,16,32 );
   sph_shavite512_init( &x13bcd_ctx.shavite );
 #if defined(__aarch64__)
   sph_simd512_init( &x13bcd_ctx.simd );
 #else
   init_sd( &x13bcd_ctx.simd, 512 );
 #endif
   sm3_init( &x13bcd_ctx.sm3 );
   sph_hamsi512_init( &x13bcd_ctx.hamsi );
 };
@@ -127,13 +114,7 @@ void x13bcd_hash(void *output, const void *input)
    sph_shavite512( &ctx.shavite, hash, 64);
    sph_shavite512_close( &ctx.shavite, hash);
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                                   (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
    update_final_echo ( &ctx.echo, (BitSequence *)hash,
--- a/algo/x13/x13sm3.c
+++ b/algo/x13/x13sm3.c
@@ -17,11 +17,7 @@
 #include "algo/fugue/sph_fugue.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #if defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -46,11 +42,7 @@ typedef struct {
   hashState_luffa         luffa;
   cubehashParam           cube;
   sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+   simd512_context         simd;
   sph_simd512_context     simd;
 #else
   hashState_sd            simd;
 #endif
   sm3_ctx_t               sm3;
   sph_hamsi512_context    hamsi;
   sph_fugue512_context    fugue;
@@ -75,11 +67,6 @@ void init_x13sm3_ctx()
   init_luffa( &hsr_ctx.luffa,512 );
   cubehashInit( &hsr_ctx.cube,512,16,32 );
   sph_shavite512_init( &hsr_ctx.shavite );
 #if defined(__aarch64__)
   sph_simd512_init( &hsr_ctx.simd );
 #else
   init_sd( &hsr_ctx.simd,512 );
 #endif
   sm3_init( &hsr_ctx.sm3 );
   sph_hamsi512_init( &hsr_ctx.hamsi );
   sph_fugue512_init( &hsr_ctx.fugue );
@@ -123,13 +110,7 @@ void x13sm3_hash(void *output, const void *input)
    sph_shavite512( &ctx.shavite, hash, 64);
        sph_shavite512_close( &ctx.shavite, hash);
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                                   (const BitSequence *)hash, 512 );
 #endif
        //11---echo---
 #ifdef __AES__
--- a/algo/x14/x14.c
+++ b/algo/x14/x14.c
@@ -15,11 +15,7 @@
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #if defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -49,11 +45,7 @@ typedef struct {
   hashState_luffa         luffa;
   cubehashParam           cube;
   sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+   simd512_context         simd;
   sph_simd512_context     simd;
 #else
   hashState_sd            simd;
 #endif
   sph_hamsi512_context    hamsi;
   sph_shabal512_context   shabal;
 } x14_ctx_holder;
@@ -79,11 +71,6 @@ void init_x14_ctx()
   init_luffa( &x14_ctx.luffa,512 );
   cubehashInit( &x14_ctx.cube,512,16,32 );
   sph_shavite512_init( &x14_ctx.shavite );
 #if defined(__aarch64__)
   sph_simd512_init( &x14_ctx.simd );
 #else
   init_sd( &x14_ctx.simd, 512 );
 #endif
   sph_hamsi512_init( &x14_ctx.hamsi );
   sph_shabal512_init( &x14_ctx.shabal );
 };
@@ -124,13 +111,7 @@ void x14hash(void *output, const void *input)
    sph_shavite512( &ctx.shavite, hash, 64);
    sph_shavite512_close( &ctx.shavite, hash);
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                                   (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
    update_final_echo ( &ctx.echo, (BitSequence *)hash,
--- a/algo/x15/x15.c
+++ b/algo/x15/x15.c
@@ -17,12 +17,7 @@
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #if defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
  #include "algo/groestl/aes_ni/hash-groestl.h"
@@ -52,11 +47,7 @@ typedef struct {
   hashState_luffa         luffa;
   cubehashParam           cubehash;
   sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+   simd512_context         simd;
   sph_simd512_context     simd;
 #else
   hashState_sd            simd;
 #endif
   sph_hamsi512_context    hamsi;
   sph_shabal512_context   shabal;
   sph_whirlpool_context   whirlpool;
@@ -83,11 +74,6 @@ void init_x15_ctx()
   init_luffa( &x15_ctx.luffa,512 );
   cubehashInit( &x15_ctx.cubehash, 512, 16, 32 );
   sph_shavite512_init( &x15_ctx.shavite );
 #if defined(__aarch64__)
   sph_simd512_init( &x15_ctx.simd );
 #else
   init_sd( &x15_ctx.simd, 512 );
 #endif
   sph_hamsi512_init( &x15_ctx.hamsi );
   sph_shabal512_init( &x15_ctx.shabal );
   sph_whirlpool_init( &x15_ctx.whirlpool );
@@ -131,13 +117,7 @@ void x15hash(void *output, const void *input)
    sph_shavite512( &ctx.shavite, hash, 64);
    sph_shavite512_close( &ctx.shavite, hash);
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                                   (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
    update_final_echo ( &ctx.echo, (BitSequence *)hash,
--- a/algo/x16/hex.c
+++ b/algo/x16/hex.c
@@ -236,7 +236,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
   do
   {
      edata[19] = nonce;
-      if ( hex_hash( hash32, edata, thr_id ) );
+      if ( hex_hash( hash32, edata, thr_id ) )
      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
      {
         be32enc( &pdata[19], nonce );
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -526,7 +526,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
   do
   {
-      if( x16r_8way_hash( hash, vdata, thr_id ) );
+      if ( x16r_8way_hash( hash, vdata, thr_id ) )
      for ( int i = 0; i < 8; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
@@ -952,7 +952,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
-      if ( x16r_4way_hash( hash, vdata, thr_id ) );
+      if ( x16r_4way_hash( hash, vdata, thr_id ) )
      for ( int i = 0; i < 4; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
@@ -1353,7 +1353,7 @@ int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do
   {
-      if ( x16r_2x64_hash( hash, vdata, thr_id ) );
+      if ( x16r_2x64_hash( hash, vdata, thr_id ) )
      for ( int i = 0; i < 2; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -15,7 +15,6 @@
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/sph_simd.h"
 #include "algo/simd/nist.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
--- a/algo/x16/x20r.c
+++ b/algo/x16/x20r.c
@@ -137,7 +137,7 @@ int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
   do
   {
-      if( x20r_8x64_hash( hash, vdata, thr_id ) );
+      if ( x20r_8x64_hash( hash, vdata, thr_id ) )
      for ( int i = 0; i < 8; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
@@ -205,7 +205,7 @@ int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do
   {
-      if ( x20r_4x64_hash( hash, vdata, thr_id ) );
+      if ( x20r_4x64_hash( hash, vdata, thr_id ) )
      for ( int i = 0; i < 4; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
@@ -269,7 +269,7 @@ int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
   *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
   do
   {
-      if ( x20r_2x64_hash( hash, vdata, thr_id ) );
+      if ( x20r_2x64_hash( hash, vdata, thr_id ) )
      for ( int i = 0; i < 2; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
--- a/algo/x17/sonoa.c
+++ b/algo/x17/sonoa.c
@@ -18,11 +18,7 @@
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/sph-haval.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #include "algo/sha/sph_sha2.h"
 #if defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
@@ -53,11 +49,7 @@ typedef struct {
        hashState_luffa         luffa;
        cubehashParam           cubehash;
        sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+        simd512_context         simd;
        sph_simd512_context     simd;
 #else
        hashState_sd            simd;
 #endif
        sph_hamsi512_context    hamsi;
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
@@ -86,11 +78,6 @@ void init_sonoa_ctx()
        init_luffa( &sonoa_ctx.luffa, 512 );
        cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 );
        sph_shavite512_init( &sonoa_ctx.shavite );
 #if defined(__aarch64__)
        sph_simd512_init( &sonoa_ctx.simd );
 #else
        init_sd( &sonoa_ctx.simd, 512 );
 #endif
        sph_hamsi512_init( &sonoa_ctx.hamsi );
        sph_shabal512_init( &sonoa_ctx.shabal );
        sph_whirlpool_init( &sonoa_ctx.whirlpool );
@@ -134,13 +121,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
 	sph_shavite512(&ctx.shavite, hash, 64);
 	sph_shavite512_close(&ctx.shavite, hash);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                       (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
   update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -189,13 +170,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
   sph_shavite512(&ctx.shavite, hash, 64);
   sph_shavite512_close(&ctx.shavite, hash);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, hash, hash, 64 );
   sph_simd512(&ctx.simd, (const void*) hash, 64);
   sph_simd512_close(&ctx.simd, hash);
 #else
   update_final_sd( &ctx.simd, (BitSequence *)hash,
                       (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
   init_echo( &ctx.echo, 512 );
@@ -249,13 +224,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
   sph_shavite512(&ctx.shavite, hash, 64);
   sph_shavite512_close(&ctx.shavite, hash);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, hash, hash, 64 );
   sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
   update_final_sd( &ctx.simd, (BitSequence *)hash,
                       (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
   init_echo( &ctx.echo, 512 );
@@ -318,13 +287,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
   sph_shavite512(&ctx.shavite, hash, 64);
   sph_shavite512_close(&ctx.shavite, hash);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                       (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
   init_echo( &ctx.echo, 512 );
@@ -410,13 +373,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
   sph_shavite512(&ctx.shavite, hash, 64);
   sph_shavite512_close(&ctx.shavite, hash);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512_init( &ctx.simd );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    simd_full( &ctx.simd, hash, hash, 512 );
 #endif
 #if defined(__AES__)
   init_echo( &ctx.echo, 512 );
@@ -483,13 +440,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
   sph_shavite512(&ctx.shavite, hash, 64);
   sph_shavite512_close(&ctx.shavite, hash);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512_init( &ctx.simd );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    simd_full( &ctx.simd, hash, hash, 512 );
 #endif
 #if defined(__AES__)
   init_echo( &ctx.echo, 512 );
@@ -527,7 +478,6 @@ int sonoa_hash( void *state, const void *input, int thr_id )
   sph_whirlpool_close(&ctx.whirlpool, hash);
   if ( work_restart[thr_id].restart ) return 0;
 //
   sph_bmw512_init( &ctx.bmw);
   sph_bmw512(&ctx.bmw, hash, 64);
@@ -565,13 +515,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
   sph_shavite512(&ctx.shavite, hash, 64);
   sph_shavite512_close(&ctx.shavite, hash);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, hash, hash, 64 );
    sph_simd512_init( &ctx.simd );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    simd_full( &ctx.simd, hash, hash, 512 );
 #endif
 #if defined(__AES__)
   init_echo( &ctx.echo, 512 );
--- a/algo/x17/x17.c
+++ b/algo/x17/x17.c
@@ -18,11 +18,7 @@
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/sph-haval.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #include "algo/sha/sph_sha2.h"
 #if defined(__AES__)
  #include "algo/fugue/fugue-aesni.h"
@@ -34,7 +30,7 @@
  #include "algo/fugue/sph_fugue.h"
 #endif
 #include "algo/blake/sph_blake.h"
-#include "algo/cubehash/sph_cubehash.h"
+//#include "algo/cubehash/sph_cubehash.h"
 #include "algo/luffa/sph_luffa.h"
@@ -63,17 +59,9 @@ union _x17_context_overlay
 #else
        hashState_luffa         luffa;
 #endif
 //#if defined(__aarch64__)
 //        sph_cubehash512_context    cube;
 //#else
        cubehashParam           cube;
 //#endif
        sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+        simd512_context         simd;
        sph_simd512_context     simd;
 #else
        hashState_sd            simd;
 #endif
        sph_hamsi512_context    hamsi;
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
@@ -127,26 +115,13 @@ int x17_hash(void *output, const void *input, int thr_id )
    luffa_full( &ctx.luffa, hash, 512, hash, 64 );
 #endif
 //#if defined(__aarch64__)
 //    sph_cubehash512_init(&ctx.cube);
 //    sph_cubehash512(&ctx.cube, (const void*) hash, 64);
 //    sph_cubehash512_close(&ctx.cube, hash);
 //#else
    cubehash_full( &ctx.cube, hash, 512, hash, 64 );
 //#endif
    sph_shavite512_init( &ctx.shavite );
    sph_shavite512( &ctx.shavite, hash, 64);
    sph_shavite512_close( &ctx.shavite, hash);
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hash, hash, 64 );        
    sph_simd512_init( &ctx.simd );
    sph_simd512(&ctx.simd, (const void*) hash, 64);
    sph_simd512_close(&ctx.simd, hash);
 #else
    simd_full( &ctx.simd, (BitSequence *)hash,
               (const BitSequence *)hash, 512 );
 #endif
 #if defined(__AES__)
    echo_full( &ctx.echo, (BitSequence *)hash, 512,
--- a/algo/x17/xevan.c
+++ b/algo/x17/xevan.c
@@ -17,11 +17,7 @@
 #include "algo/shabal/sph_shabal.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/sph-haval.h"
-#if defined(__aarch64__)
+#include "algo/simd/simd-hash-2way.h"
  #include "algo/simd/sph_simd.h"
 #else
  #include "algo/simd/nist.h"
 #endif
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/sha/sph_sha2.h"
 #if defined(__AES__)
@@ -45,11 +41,7 @@ typedef struct {
        hashState_luffa         luffa;
        cubehashParam           cubehash;
        sph_shavite512_context  shavite;
-#if defined(__aarch64__)
+        simd512_context         simd;
        sph_simd512_context     simd;
 #else
        hashState_sd            simd;
 #endif
        sph_hamsi512_context    hamsi;
        sph_shabal512_context   shabal;
        sph_whirlpool_context   whirlpool;
@@ -78,11 +70,6 @@ void init_xevan_ctx()
        init_luffa( &xevan_ctx.luffa, 512 );
        cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 );
        sph_shavite512_init( &xevan_ctx.shavite );
 #if defined(__aarch64__)
        sph_simd512_init( &xevan_ctx.simd );
 #else
        init_sd( &xevan_ctx.simd, 512 );
 #endif
        sph_hamsi512_init( &xevan_ctx.hamsi );
        sph_shabal512_init( &xevan_ctx.shabal );
        sph_whirlpool_init( &xevan_ctx.whirlpool );
@@ -137,13 +124,7 @@ int xevan_hash(void *output, const void *input, int thr_id )
   sph_shavite512(&ctx.shavite, hash, dataLen);
   sph_shavite512_close(&ctx.shavite, hash);
-#if defined(__aarch64__)
+   simd512_ctx( &ctx.simd, hash, hash, dataLen );
    sph_simd512( &ctx.simd, (const void*) hash, dataLen );
    sph_simd512_close( &ctx.simd, hash );
 #else
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                         (const BitSequence *)hash, dataLen*8 );
 #endif
 #if defined(__AES__)
   update_final_echo( &ctx.echo, (BitSequence *) hash,
@@ -210,13 +191,14 @@ int xevan_hash(void *output, const void *input, int thr_id )
   sph_shavite512(&ctx.shavite, hash, dataLen);
   sph_shavite512_close(&ctx.shavite, hash);
-#if defined(__aarch64__)
+    simd512_ctx( &ctx.simd, hash, hash, dataLen );
-    sph_simd512(&ctx.simd, (const void*) hash, 64);
+//#if defined(__aarch64__)
-    sph_simd512_close(&ctx.simd, hash);
+//    sph_simd512(&ctx.simd, (const void*) hash, 64);
-#else
+//    sph_simd512_close(&ctx.simd, hash);
-   update_final_sd( &ctx.simd, (BitSequence *)hash,
+//#else
-                         (const BitSequence *)hash, dataLen*8 );
+//   update_final_sd( &ctx.simd, (BitSequence *)hash,
-#endif
+//                         (const BitSequence *)hash, dataLen*8 );
 //#endif
 #if defined(__AES__)
   update_final_echo( &ctx.echo, (BitSequence *) hash,
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -18,7 +18,6 @@
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/shavite-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/nist.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/fugue/fugue-aesni.h"
 #include "algo/whirlpool/sph_whirlpool.h"
--- a/61
+++ b/61
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.8.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='24.8'
+PACKAGE_VERSION='25.1'
-PACKAGE_STRING='cpuminer-opt 24.8'
+PACKAGE_STRING='cpuminer-opt 25.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -657,6 +657,10 @@ JANSSON_LIBS
 LIBCURL_CPPFLAGS
 LIBCURL_CFLAGS
 LIBCURL
 X86_64_APPLE_FALSE
 X86_64_APPLE_TRUE
 ARM64_APPLE_FALSE
 ARM64_APPLE_TRUE
 HAVE_APPLE_FALSE
 HAVE_APPLE_TRUE
 MINGW_FALSE
@@ -1362,7 +1366,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 24.8 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 25.1 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1434,7 +1438,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 24.8:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 25.1:";;
   esac
  cat <<\_ACEOF
@@ -1540,7 +1544,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 24.8
+cpuminer-opt configure 25.1
 generated by GNU Autoconf 2.71
 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1987,7 +1991,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 24.8, which was
+It was created by cpuminer-opt $as_me 25.1, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
  $ $0$ac_configure_args_raw
@@ -3595,7 +3599,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='24.8'
+ VERSION='25.1'
 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6508,11 +6512,19 @@ case $target in
  i*86-*-*)
    have_x86=true
    ;;
  aarch64-apple-*|arm64-apple-*)
    have_arm64=true
    have_arm64_apple=true
    ;;
  x86_64-apple-*|amd64-apple-*)
    have_x86_64=true
    have_x86_64_apple=true
    ;;
  x86_64-*-*|amd64-*-*)
    have_x86_64=true
    ;;
  aarch64*-*-*|arm64*-*-*)
-    have_aarch64=true
+    have_arm4=true
    ;;
  powerpc*-*-*)
    have_ppc=true
@@ -6533,7 +6545,6 @@ case $target in
    ;;
 esac
 # Check whether --enable-assembly was given.
 if test ${enable_assembly+y}
 then :
@@ -6946,7 +6957,7 @@ else
  ARCH_x86_64_FALSE=
 fi
- if test x$have_aarch64 = xtrue; then
+ if test x$have_arm64 = xtrue; then
  ARCH_ARM64_TRUE=
  ARCH_ARM64_FALSE='#'
 else
@@ -6970,6 +6981,22 @@ else
  HAVE_APPLE_FALSE=
 fi
 if test x$have_arm64_apple = xtrue; then
  ARM64_APPLE_TRUE=
  ARM64_APPLE_FALSE='#'
 else
  ARM64_APPLE_TRUE='#'
  ARM64_APPLE_FALSE=
 fi
 if test x$have_x86_64_apple = xtrue; then
  X86_64_APPLE_TRUE=
  X86_64_APPLE_FALSE='#'
 else
  X86_64_APPLE_TRUE='#'
  X86_64_APPLE_FALSE=
 fi
 if test x$request_jansson = xtrue ; then
 	JANSSON_LIBS="compat/jansson/libjansson.a"
@@ -7213,6 +7240,14 @@ if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
  as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
 if test -z "${ARM64_APPLE_TRUE}" && test -z "${ARM64_APPLE_FALSE}"; then
  as_fn_error $? "conditional \"ARM64_APPLE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
 if test -z "${X86_64_APPLE_TRUE}" && test -z "${X86_64_APPLE_FALSE}"; then
  as_fn_error $? "conditional \"X86_64_APPLE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
 : "${CONFIG_STATUS=./config.status}"
 ac_write_fail=0
@@ -7603,7 +7638,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 24.8, which was
+This file was extended by cpuminer-opt $as_me 25.1, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -7671,7 +7706,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 24.8
+cpuminer-opt config.status 25.1
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [24.8])
+AC_INIT([cpuminer-opt], [25.1])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
@@ -45,11 +45,19 @@ case $target in
  i*86-*-*)
    have_x86=true
    ;;
  aarch64-apple-*|arm64-apple-*)
    have_arm64=true
    have_arm64_apple=true
    ;;
  x86_64-apple-*|amd64-apple-*)
    have_x86_64=true
    have_x86_64_apple=true
    ;;
  x86_64-*-*|amd64-*-*)
    have_x86_64=true
    ;;
  aarch64*-*-*|arm64*-*-*)
-    have_aarch64=true
+    have_arm4=true
    ;;
  powerpc*-*-*)
    have_ppc=true
@@ -70,7 +78,6 @@ case $target in
    ;;
 esac
 AC_ARG_ENABLE([assembly],
  AS_HELP_STRING([--disable-assembly], [disable assembly-language routines]))
 if test x$enable_assembly != xno; then
@@ -138,9 +145,11 @@ AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
 AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno])
 AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
 AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
-AM_CONDITIONAL([ARCH_ARM64], [test x$have_aarch64 = xtrue])
+AM_CONDITIONAL([ARCH_ARM64], [test x$have_arm64 = xtrue])
 AM_CONDITIONAL([MINGW], [test "x$OS" = "xWindows_NT"])
 AM_CONDITIONAL([HAVE_APPLE], [test x$have_apple = xtrue])
 AM_CONDITIONAL([ARM64_APPLE], [test x$have_arm64_apple = xtrue])
 AM_CONDITIONAL([X86_64_APPLE], [test x$have_x86_64_apple = xtrue])
 if test x$request_jansson = xtrue ; then
 	JANSSON_LIBS="compat/jansson/libjansson.a"
--- a/61
+++ b/61
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.72 for cpuminer-opt 24.8.
+# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='24.8'
+PACKAGE_VERSION='25.1'
-PACKAGE_STRING='cpuminer-opt 24.8'
+PACKAGE_STRING='cpuminer-opt 25.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -650,6 +650,10 @@ JANSSON_LIBS
 LIBCURL_CPPFLAGS
 LIBCURL_CFLAGS
 LIBCURL
 X86_64_APPLE_FALSE
 X86_64_APPLE_TRUE
 ARM64_APPLE_FALSE
 ARM64_APPLE_TRUE
 HAVE_APPLE_FALSE
 HAVE_APPLE_TRUE
 MINGW_FALSE
@@ -1355,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-'configure' configures cpuminer-opt 24.8 to adapt to many kinds of systems.
+'configure' configures cpuminer-opt 25.1 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1427,7 +1431,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 24.8:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 25.1:";;
   esac
  cat <<\_ACEOF
@@ -1532,7 +1536,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 24.8
+cpuminer-opt configure 25.1
 generated by GNU Autoconf 2.72
 Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1953,7 +1957,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 24.8, which was
+It was created by cpuminer-opt $as_me 25.1, which was
 generated by GNU Autoconf 2.72.  Invocation command line was
  $ $0$ac_configure_args_raw
@@ -3768,7 +3772,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='24.8'
+ VERSION='25.1'
 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6463,11 +6467,19 @@ case $target in
  i*86-*-*)
    have_x86=true
    ;;
  aarch64-apple-*|arm64-apple-*)
    have_arm64=true
    have_arm64_apple=true
    ;;
  x86_64-apple-*|amd64-apple-*)
    have_x86_64=true
    have_x86_64_apple=true
    ;;
  x86_64-*-*|amd64-*-*)
    have_x86_64=true
    ;;
  aarch64*-*-*|arm64*-*-*)
-    have_aarch64=true
+    have_arm4=true
    ;;
  powerpc*-*-*)
    have_ppc=true
@@ -6488,7 +6500,6 @@ case $target in
    ;;
 esac
 # Check whether --enable-assembly was given.
 if test ${enable_assembly+y}
 then :
@@ -6950,7 +6961,7 @@ else
  ARCH_x86_64_FALSE=
 fi
- if test x$have_aarch64 = xtrue; then
+ if test x$have_arm64 = xtrue; then
  ARCH_ARM64_TRUE=
  ARCH_ARM64_FALSE='#'
 else
@@ -6974,6 +6985,22 @@ else
  HAVE_APPLE_FALSE=
 fi
 if test x$have_arm64_apple = xtrue; then
  ARM64_APPLE_TRUE=
  ARM64_APPLE_FALSE='#'
 else
  ARM64_APPLE_TRUE='#'
  ARM64_APPLE_FALSE=
 fi
 if test x$have_x86_64_apple = xtrue; then
  X86_64_APPLE_TRUE=
  X86_64_APPLE_FALSE='#'
 else
  X86_64_APPLE_TRUE='#'
  X86_64_APPLE_FALSE=
 fi
 if test x$request_jansson = xtrue ; then
 	JANSSON_LIBS="compat/jansson/libjansson.a"
@@ -7229,6 +7256,14 @@ if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
  as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
 if test -z "${ARM64_APPLE_TRUE}" && test -z "${ARM64_APPLE_FALSE}"; then
  as_fn_error $? "conditional \"ARM64_APPLE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
 if test -z "${X86_64_APPLE_TRUE}" && test -z "${X86_64_APPLE_FALSE}"; then
  as_fn_error $? "conditional \"X86_64_APPLE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
 : "${CONFIG_STATUS=./config.status}"
 ac_write_fail=0
@@ -7622,7 +7657,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 24.8, which was
+This file was extended by cpuminer-opt $as_me 25.1, which was
 generated by GNU Autoconf 2.72.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -7690,7 +7725,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 24.8
+cpuminer-opt config.status 25.1
 configured by $0, generated by GNU Autoconf 2.72,
  with options \\"\$ac_cs_config\\"
--- a/simd-utils.h
+++ b/simd-utils.h
@@ -29,7 +29,6 @@
 //    is no significant 64 bit vectorization therefore SSE2 is the practical
 //    minimum for using this code.
 //
 //    MMX:     64 bit vectors  (Not used in cpuminer-opt) 
 //    SSE2:   128 bit vectors  (64 bit CPUs only, such as Intel Core2.
 //    AVX2:   256 bit vectors  (Starting with Intel Haswell and AMD Ryzen)
 //    AVX512: 512 bit vectors  (Starting with SkylakeX)
@@ -217,9 +216,6 @@
 #include "simd-utils/simd-int.h"
 // x86_64 MMX 64 bit vectors
 #include "simd-utils/simd-64.h"
 // x86_64 SSE2 128 bit vectors
 #include "simd-utils/simd-128.h"
--- a/simd-utils/simd-64.h
+++ b/simd-utils/simd-64.h
@@ -1,193 +0,0 @@
 #if !defined(SIMD_64_H__)
 #define SIMD_64_H__ 1
 #if defined(__x86_64__) && defined(__MMX__)
 ////////////////////////////////////////////////////////////////
 //
 //               64 bit MMX vectors.
 //
 // This code is not used anywhere annd likely never will. It's intent was
 // to support 2 way parallel hashing using  MMX, or NEON for 32 bit hash
 // functions, but hasn't been implementedwas never implemented.
 //
 // MMX is being deprecated by compilers, all intrinsics will be converted to use SSE
 // registers and instructions. MMX will still be available using ASM.
 // For backward compatibility it's likely the compiler won't allow mixing explicit SSE
 // with promoted MMX. It is therefore preferable to implement all 64 bit vector code
 // using explicit SSE with the upper 64 bits being ignored.
 // Using SSE for 64 bit vectors will complicate loading arrays from memory which will
 // always load 128 bits. Odd indexes will need to be extracted from the upper 64 bits
 // of the even index SSE register. 
 // In most cases the exiting 4x32 SSE code can be used with 2 lanes being ignored
 // making ths file obsolete.
 #define v64_t                        __m64
 #define v64u32_t                     v64_t
 #define v64_load                      _mm_load_si64
 #define v64_store                     _mm_store_si64
 #define v64_64(i64)                   ((__m64)(i64))
 #define v64_32                        _mm_set1_pi32
 #define v64_16                        _mm_set1_pi16
 #define v64_8                         _mm_set1_pi8
 #define v64_add32                     _mm_add_pi32
 #define v64_add16                     _mm_add_pi16
 #define v64_add8                      _mm_add_pi8
 #define v64_mul32                     _mm_mullo_pi32
 #define v64_mul16                     _mm_mullo_pi16
 // compare
 #define v64_cmpeq32                   _mm_cmpeq_epi32
 #define v64_cmpeq16                   _mm_cmpeq_epi16
 #define v64_cmpeq8                    _mm_cmpeq_epi8
 #define v64_cmpgt32                   _mm_cmpgt_epi32
 #define v64_cmpgt16                   _mm_cmpgt_epi16
 #define v64_cmpgt8                    _mm_cmpgt_epi8
 #define v64_cmplt32                   _mm_cmplt_epi32
 #define v64_cmplt16                   _mm_cmplt_epi16
 #define v64_cmplt8                    _mm_cmplt_epi8
 // bit shift
 #define v64_sl32                      _mm_slli_epi32
 #define v64_sl16                      _mm_slli_epi16
 #define v64_sl8                       _mm_slli_epi8
 #define v64_sr32                      _mm_srli_epi32
 #define v64_sr16                      _mm_srli_epi16
 #define v64_sr8                       _mm_srli_epi8
 #define v64_sra32                     _mm_srai_epi32
 #define v64_sra16                     _mm_srai_epi16
 #define v64_sra8                      _mm_srai_epi8
 #define v64_alignr8                   _mm_alignr_pi8
 #define v64_unpacklo32                _mm_unpacklo_pi32
 #define v64_unpackhi32                _mm_unpackhi_pi32
 #define v64_unpacklo16                _mm_unpacklo_pi16
 #define v64_unpackhi16                _mm_unpacklhi_pi16
 #define v64_unpacklo8                 _mm_unpacklo_pi8
 #define v64_unpackhi8                 _mm_unpackhi_pi16
 // Pseudo constants
 #define v64_zero        _mm_setzero_si64()
 #define v64_one_64      _mm_set_pi32(  0UL, 1UL )
 #define v64_one_32      v64_32( 1UL )
 #define v64_one_16      v64_16( 1U )
 #define v64_one_8       v64_8(  1U );
 #define v64_neg1        v64_32( 0xFFFFFFFFUL )
 #define casti_v64(p,i) (((v64_t*)(p))[(i)])
 // Bitwise not: ~(a)
 //#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
 #define v64_not( a ) ( (v64_t)( ~( (uint64_t)(a) ) )
 /*      
 // Unary negate elements
 #define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
 #define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
 #define mm64_negate_8(  v ) _mm_sub_pi8(  m64_zero, v )
 */
 static inline void v64_memset_zero( __m64 *dst,  const int n )
 {   for ( int i = 0; i < n; i++ ) dst[i] = v64_zero; }
 static inline void v64_memset( __m64 *dst, const __m64 a, const int n )
 {   for ( int i = 0; i < n; i++ ) dst[i] = a; }
 static inline void v64_memcpy( __m64 *dst, const __m64 *src, const int n )
 {   for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
 #define v64_or                       _mm_or_si64
 #define v64_and                      _mm_and_si64
 #define v64_xor                      _mm_xor_si64
 #define v64_andnot                   _mm_andnot_si64
 #define v64_xor3( v2, v1, v0 )       v64_xor( v2, v64_andnot( v1, v0 ) )
 #define v64_xorandnot( v2, v1, v0 )  v64_xor( v2, v64_andnot( v1, v0 ) )
 // Rotate bits in packed elements of 64 bit vector
 #define v64_rol64( a, n ) \
   _mm_or_si64( _mm_slli_si64( a, n ), \
                _mm_srli_si64( a, 64-(n) ) )
 #define v64_ror64( a, n ) \
   _mm_or_si64( _mm_srli_si64( a, n ), \
                _mm_slli_si64( a, 64-(n) ) )
 #define v64_rol32( a, n ) \
   _mm_or_si64( _mm_slli_pi32( a, n ), \
                _mm_srli_pi32( a, 32-(n) ) )
 #define v64_ror32( a, n ) \
   _mm_or_si64( _mm_srli_pi32( a, n ), \
                _mm_slli_pi32( a, 32-(n) ) )
 #define v64_rol16( a, n ) \
   _mm_or_si64( _mm_slli_pi16( a, n ), \
                _mm_srli_pi16( a, 16-(n) ) )
 #define v64_ror16( a, n ) \
   _mm_or_si64( _mm_srli_pi16( a, n ), \
                _mm_slli_pi16( a, 16-(n) ) )
 // Rotate packed elements accross lanes. Useful for byte swap and byte
 // rotation.
 #if defined(__SSE__)
 // Swap hi & lo 32 bits.
 #define v64_swap32( a )      _mm_shuffle_pi16( a, 0x4e )
 #define v64_shulfr16( a )     _mm_shuffle_pi16( a, 0x39 ) 
 #define v64_shufll16( a )     _mm_shuffle_pi16( a, 0x93 ) 
 // Swap hi & lo 16 bits of each 32 bit element
 #define v64_swap32_16( a )    _mm_shuffle_pi16( a, 0xb1 )
 #endif   // SSE
 #if defined(__SSSE3__)
 // Endian byte swap packed elements
 #define v64_bswap32( v ) \
    _mm_shuffle_pi8( v, (__m64)0x0405060700010203 )
 #define v64_bswap16( v ) \
    _mm_shuffle_pi8( v, (__m64)0x0607040502030001 );
 // Rotate right by c bytes
 static inline v64_t v64_shuflr_x8( __m64 v, const int c )
 { return _mm_alignr_pi8( v, v, c ); }
 #else
 #define v64_bswap32( v ) \
   _mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \
                 __builtin_bswap32( ((uint32_t*)&v)[0] )  )
 #define v64_bswap16( v ) \
   _mm_set_pi16( __builtin_bswap16( ((uint16_t*)&v)[3] ), \
                 __builtin_bswap16( ((uint16_t*)&v)[2] ), \
                 __builtin_bswap16( ((uint16_t*)&v)[1] ), \
                 __builtin_bswap16( ((uint16_t*)&v)[0] )  )
 #endif   // SSSE3
 #define v64_blendv( v1, v0, mask ) \
   v64_or( v64_and( mask, v1 ), v64_andnot( mask, v0 ) )
 #endif // MMX
 #endif // SIMD_64_H__
--- a/simd-utils/simd-int.h
+++ b/simd-utils/simd-int.h
@@ -19,6 +19,9 @@ static inline uint64_t bswap_64( uint64_t a )
   return b;
 }
 // This produces warnings from clang, but its suggested workaround 
 // "rev32 %w0, %w1\n\t" produced errors instead. GCC doesn't complain and
 // it works as is on both.
 static inline uint32_t bswap_32( uint32_t a )
 {
   uint32_t b;