Initial upload v3.4.7

2026-02-22 16:33:08 +00:00 · 2016-09-22 13:16:18 -04:00
parent a3c8079774
commit a35039bc05
480 changed files with 211015 additions and 3 deletions
--- a/algo/simd/sse2/.dirstamp
+++ b/algo/simd/sse2/.dirstamp
--- a/algo/simd/sse2/compat.h
+++ b/algo/simd/sse2/compat.h
@@ -0,0 +1,205 @@
+#ifndef __COMPAT_H__
+#define __COMPAT_H__
+
+#include <limits.h>
+
+
+/* 
+ * This file defines some helper functions for cross-platform compatibility.
+ */
+
+#if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__)
+#define GNU_EXT
+#endif
+
+/*
+ * First define some integer types.
+ */
+
+#if defined __STDC__ && __STDC_VERSION__ >= 199901L
+
+/*
+ * On C99 implementations, we can use <stdint.h> to get an exact 32-bit
+ * type, if any, or otherwise use a wider type.
+ */
+
+#include <stdint.h>
+
+#ifdef UINT32_MAX
+typedef uint32_t u32;
+#else
+typedef uint_fast32_t u32;
+#endif
+
+typedef unsigned long long u64;
+
+#define C32(x)    ((u32)(x))
+
+#define HAS_64  1
+
+#else
+
+/*
+ * On non-C99 systems, we use "unsigned int" if it is wide enough,
+ * "unsigned long" otherwise. This supports all "reasonable" architectures.
+ * We have to be cautious: pre-C99 preprocessors handle constants
+ * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
+ */
+
+#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
+
+typedef unsigned int u32;
+
+#define C32(x)    ((u32)(x ## U))
+
+#else
+
+typedef unsigned long u32;
+
+#define C32(x)    ((u32)(x ## UL))
+
+#endif
+
+/*
+ * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
+ * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
+ * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
+ * test whether "unsigned long long" is available; we also know that
+ * gcc features this type, even if the libc header do not know it.
+ */
+
+#if ((ULONG_MAX >> 31) >> 31) >= 3
+
+typedef unsigned long u64;
+
+#define HAS_64  1
+
+#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
+
+typedef unsigned long long u64;
+
+#define HAS_64  1
+
+#else
+
+/*
+ * No 64-bit type...
+ */
+
+#endif
+
+#endif
+
+
+/*
+ * fft_t should be at least 16 bits wide.
+ * using short int will require less memory, but int is faster...
+ */
+
+typedef int fft_t;
+
+
+/*
+ * 32-bit word helpers.
+ *   T32(x)        keeps the low 32 bits of x.
+ *   ROTL32(x, n)  rotates x left by n bits, ROTR32(x, n) right by n bits.
+ * ROTL32 and ROTR32 expand x and n more than once, so pass expressions
+ * without side effects. Keep n in 1..31: n == 0 (or 32) turns one of the
+ * shifts into a shift by 32, which is undefined when u32 is exactly
+ * 32 bits wide.
+ *
+ * Implementation note: some processors have specific opcodes to perform
+ * a rotation. Recent versions of gcc recognize the expression below and
+ * use the relevant opcodes, when appropriate.
+ */
+
+#define T32(x)    ((x) & C32(0xFFFFFFFF))
+#define ROTL32(x, n)   T32(((x) << (n)) | ((x) >> (32 - (n))))
+#define ROTR32(x, n)   ROTL32(x, (32 - (n)))
+
+
+
+/*
+ * The macro MAYBE_INLINE expands to an inline qualifier, if available:
+ * "static inline" on C99 compilers or when GNU_EXT is defined,
+ * "__inline" with MSVC, and nothing at all otherwise.
+ */
+
+#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT
+#define MAYBE_INLINE static inline
+#elif defined _MSC_VER
+#define MAYBE_INLINE __inline
+#else
+#define MAYBE_INLINE
+#endif
+
+
+/*
+ * rdtsc(): read the CPU time stamp counter.
+ *
+ * With gcc on i386/x86-64 it is a GNU statement expression that first
+ * executes cpuid (to serialize), then the rdtsc instruction, and yields
+ * the two 32-bit halves combined into a u64.
+ * With MSVC on x86/x64 the name maps to __rdtsc.
+ * On every other compiler/architecture rdtsc is left undefined.
+ */
+
+#if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ )
+
+#define rdtsc()                                                         \
+  ({                                                                    \
+    u32 lo, hi;                                                         \
+    __asm__ __volatile__ (      /* serialize */                         \
+                          "xorl %%eax,%%eax \n        cpuid"            \
+                          ::: "%rax", "%rbx", "%rcx", "%rdx");          \
+    /* We cannot use "=A", since this would use %rax on x86_64 */       \
+    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));              \
+    (u64)hi << 32 | lo;                                                 \
+  })                                                                    \
+
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+
+#define rdtsc __rdtsc
+
+#endif
+
+/*
+ * Pointer alignment tests.
+ *
+ * CHECK_ALIGNED(p,n) is non-zero when the address p is a multiple of n.
+ * n is a byte count and must be a power of two, because the test masks
+ * the address with n-1.
+ *
+ * IS_ALIGNED(p,n) tells whether data at p may be accessed in place:
+ *   - on x86, accesses of up to 4 bytes are cheap even when unaligned,
+ *     so any n <= 4 is reported as aligned;
+ *   - on the listed strict-alignment architectures it is CHECK_ALIGNED;
+ *   - on unknown architectures it is always false.
+ *
+ * IS_ALIGNED may expand n more than once; pass side-effect free arguments.
+ */
+
+
+#define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0)
+
+#if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
+/*
+ * Unaligned 32-bit accesses are not expensive on x86 so we don't care.
+ * n is parenthesized so that an expression argument (e.g. "a ? 8 : 16")
+ * is compared as a whole.
+ */
+#define IS_ALIGNED(p,n)    ((n) <= 4 || CHECK_ALIGNED(p,n))
+
+#elif defined __sparcv9 || defined __sparc || defined __arm || \
+      defined __ia64 || defined __ia64__ || \
+      defined __itanium__ || defined __M_IA64 || \
+      defined __powerpc__ || defined __powerpc
+#define IS_ALIGNED(p,n)    CHECK_ALIGNED(p,n)
+
+#else
+/* 
+ * Unknown architecture: play safe
+ */
+#define IS_ALIGNED(p,n)    0
+#endif
+
+
+
+/*
+ * checks for endianness
+ *
+ * Defines SIMD_LITTLE_ENDIAN or SIMD_BIG_ENDIAN.
+ * When a system header provides __BYTE_ORDER it is trusted; otherwise
+ * x86 targets are assumed little-endian. On any other platform neither
+ * macro is defined, so users must cope with both being absent.
+ */
+
+#if defined (__linux__) || defined (__GLIBC__)
+#  include <endian.h>
+#elif defined (__FreeBSD__)
+#  include <machine/endian.h> 
+#elif defined (__OpenBSD__)
+#  include <sys/endian.h>
+#endif
+
+#ifdef __BYTE_ORDER
+
+#  if __BYTE_ORDER == __LITTLE_ENDIAN
+#    define SIMD_LITTLE_ENDIAN
+#  elif __BYTE_ORDER == __BIG_ENDIAN
+#    define SIMD_BIG_ENDIAN
+#  endif
+
+#else
+
+#  if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
+#    define SIMD_LITTLE_ENDIAN
+#  endif
+
+#endif
+
+
+#endif
--- a/algo/simd/sse2/defs_x5.h
+++ b/algo/simd/sse2/defs_x5.h
@@ -0,0 +1,23 @@
+
+#ifndef DEFS_X5_H__
+#define DEFS_X5_H__
+#include <emmintrin.h>
+typedef unsigned char BitSequence;
+typedef unsigned long long DataLength;
+typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
+
+typedef unsigned char uint8;
+typedef unsigned int uint32;
+typedef unsigned long long uint64;
+
+typedef struct {      /* hashing context; field roles as noted on each member */
+    uint32 buffer[8]; /* Buffer to be hashed (eight uint32 words) */
+    __m128i chainv[10];   /* Chaining values (ten 128-bit vectors) */
+    uint64 bitlen[2]; /* Message length in bits */
+    uint32 rembitlen; /* Length of buffer data to be hashed */
+    int hashbitlen;   /* NOTE(review): presumably the digest size in bits - confirm with the users of this struct */
+} hashState_luffa;
+
+
+typedef unsigned char byte;
+#endif
--- a/algo/simd/sse2/nist.c
+++ b/algo/simd/sse2/nist.c
@@ -0,0 +1,269 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "nist.h"
+#include "simd_iv.h"
+
+
+/* #define NO_PRECOMPUTED_IV */
+
+
+/*
+ * Advance the message bit counter kept in `state` by `databitlen`.
+ * Without a 64-bit type the counter lives in two 32-bit words and a
+ * wrap-around of the low word carries into the high word.
+ */
+void IncreaseCounter(hashState_sd *state, DataLength databitlen) {
+#ifdef HAS_64
+      state->count = state->count + databitlen;
+#else
+      const u32 before = state->count_low;
+      state->count_low = before + databitlen;
+      if (state->count_low < before)
+        state->count_high += 1;
+#endif
+}
+
+
+/* 
+ * Initialize the hashState_sd with a given IV.
+ * If the IV is NULL, initialize with zeros.
+ *
+ *   state      - context to set up.
+ *   hashbitlen - digest size in bits; must be accepted by SupportedLength().
+ *   IV         - 4*n words (n = 4 for hashbitlen <= 256, n = 8 above),
+ *                or NULL.
+ *
+ * Returns BAD_HASHBITLEN for an unsupported digest size, SUCCESS otherwise.
+ */
+HashReturn InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) {
+
+  int n;
+
+  if (!SupportedLength(hashbitlen))
+    return BAD_HASHBITLEN;
+
+  /*
+   * Each of A, B, C and D holds n words. The precomputed IV_224/IV_256
+   * tables contain 16 words while IV_384/IV_512 contain 32, so n has to
+   * follow the digest size: a fixed n = 8 would make the copy below read
+   * past the end of the short tables.
+   */
+  n = (hashbitlen <= 256) ? 4 : 8;
+
+  state->hashbitlen = hashbitlen;
+  state->n_feistels = n;
+  state->blocksize = 128*n;   /* block length in bits */
+  
+#ifdef HAS_64
+  state->count = 0;
+#else
+  state->count_low  = 0;
+  state->count_high = 0;
+#endif  
+
+  /*
+   * A and buffer are fixed, 16-byte aligned members of the state, so
+   * nothing is allocated here; B, C and D are views into A.
+   */
+  state->B = state->A+n;
+  state->C = state->B+n;
+  state->D = state->C+n;
+
+  if (IV)
+    memcpy(state->A, IV, 4*n*sizeof(u32));
+  else
+    memset(state->A, 0, 4*n*sizeof(u32));
+
+  return SUCCESS;
+  
+}
+
+/* 
+ * Initialize the hashState_sd.
+ *
+ * For the standard digest sizes (224, 256, 384, 512) the precomputed IV
+ * is loaded, unless NO_PRECOMPUTED_IV is defined. For any other size the
+ * state starts from zero and the IV is derived by compressing one block
+ * holding the string "SIMD-<hashbitlen> v1.1" padded with zeros.
+ *
+ * Returns SUCCESS, BAD_HASHBITLEN for an unsupported digest size, or
+ * FAIL when the temporary block cannot be allocated.
+ */
+HashReturn init_sd(hashState_sd *state, int hashbitlen) {
+  HashReturn r;
+  char *init;
+
+#ifndef NO_PRECOMPUTED_IV
+  if (hashbitlen == 224)
+    r=InitIV(state, hashbitlen, IV_224);
+  else if (hashbitlen == 256)
+    r=InitIV(state, hashbitlen, IV_256);
+  else if (hashbitlen == 384)
+    r=InitIV(state, hashbitlen, IV_384);
+  else if (hashbitlen == 512)
+    r=InitIV(state, hashbitlen, IV_512);
+  else
+#endif
+    {
+      /* 
+       * Nonstandard length: IV is not precomputed.
+       */
+      r=InitIV(state, hashbitlen, NULL);
+      if (r != SUCCESS)
+        return r;
+      
+      /* zero-filled scratch block; bail out instead of writing through NULL */
+      init = calloc(1, state->blocksize);
+      if (init == NULL)
+        return FAIL;
+#if defined __STDC__ && __STDC_VERSION__ >= 199901L
+      snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen);
+#else
+      sprintf(init, "SIMD-%i v1.1", hashbitlen);
+#endif
+      SIMD_Compress(state, (unsigned char*) init, 0);
+      free(init);
+    }
+  return r;
+}
+
+
+
+/*
+ * Absorb `databitlen` bits of `data` into the hash state.
+ *
+ * Whole blocks are compressed straight from `data` when no partial block
+ * is pending and the pointer satisfies IS_ALIGNED for the alignment given
+ * by RequiredAlignment(); everything else goes through state->buffer.
+ * Returns FAIL when the bits absorbed so far are not a whole number of
+ * bytes, SUCCESS otherwise.
+ */
+HashReturn update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen) {
+  static int align = -1;
+  unsigned int block_bits = state->blocksize;
+  unsigned filled;
+
+  if (align == -1)
+    align = RequiredAlignment();
+
+  /* number of bits already waiting in state->buffer */
+#ifdef HAS_64
+  filled = state->count & (block_bits - 1);
+#else
+  filled = state->count_low & (block_bits - 1);
+#endif
+
+  /*
+   * The number of hashed bits is not a multiple of 8.
+   * Very painful to implement and not required by the NIST API.
+   */
+  if ((filled & 7) != 0)
+    return FAIL;
+
+  while (databitlen > 0) {
+    unsigned int room;
+
+    if (filled == 0 && databitlen >= block_bits && IS_ALIGNED(data,align)) {
+      /* fast path: hash one block directly from the caller's memory */
+      SIMD_Compress(state, data, 0);
+      IncreaseCounter(state, block_bits);
+      data += block_bits/8;
+      databitlen -= block_bits;
+      continue;
+    }
+
+    /* slow path: top up the staging buffer */
+    room = block_bits - filled;
+    if (databitlen < room) {
+      /* not enough input to complete the block: stash it and stop */
+      memcpy(&state->buffer[filled/8], data, (databitlen+7)/8);
+      IncreaseCounter(state, databitlen);
+      return SUCCESS;
+    }
+
+    memcpy(&state->buffer[filled/8], data, room/8);
+    IncreaseCounter(state, room);
+    data += room/8;
+    databitlen -= room;
+    filled = 0;
+    SIMD_Compress(state, state->buffer, 0);
+  }
+  return SUCCESS;
+}
+
+/*
+ * Finish the computation and write the digest to `hashval`.
+ *
+ * Any pending partial block is zero-padded and compressed, then a last
+ * block holding the message length (little-endian, in bits) is compressed
+ * with a non-zero `final` argument: 2 when fewer than 16384 bits were
+ * hashed, 1 otherwise. The first 2*n_feistels words of A are then
+ * serialized little-endian, and hashbitlen bits of them are stored in
+ * `hashval`. `hashval` must provide (hashbitlen+7)/8 bytes.
+ * Always returns SUCCESS.
+ */
+HashReturn final_sd(hashState_sd *state, BitSequence *hashval) {
+#ifdef HAS_64
+  u64 l;
+  int current = state->count & (state->blocksize - 1);
+#else
+  u32 l;
+  int current = state->count_low & (state->blocksize - 1);
+#endif
+  unsigned int i;
+  BitSequence bs[64];
+  int isshort = 1;
+
+  /* 
+   * If there is still some data in the buffer, hash it
+   */
+  if (current) {
+    /* 
+     * We first need to zero out the end of the buffer.
+     */
+    if (current & 7) {
+      BitSequence mask = 0xff >> (current&7);
+      state->buffer[current/8] &= ~mask;
+    }
+    current = (current+7)/8;
+    memset(state->buffer+current, 0, state->blocksize/8 - current);
+    SIMD_Compress(state, state->buffer, 0);
+  }
+
+  /* 
+   * Input the message length as the last block
+   */
+  memset(state->buffer, 0, state->blocksize/8);
+#ifdef HAS_64
+  l = state->count;
+  for (i=0; i<8; i++) {
+    state->buffer[i] = l & 0xff;
+    l >>= 8;
+  }
+  if (state->count < 16384)
+    isshort = 2;
+#else
+  l = state->count_low;
+  for (i=0; i<4; i++) {
+    state->buffer[i] = l & 0xff;
+    l >>= 8;
+  }
+  l = state->count_high;
+  for (i=0; i<4; i++) {
+    state->buffer[4+i] = l & 0xff;
+    l >>= 8;
+  }
+  if (state->count_high == 0 && state->count_low < 16384)
+    isshort = 2;
+#endif
+
+  SIMD_Compress(state, state->buffer, isshort);
+    
+
+  /*
+   * Decode the 32-bit words into a BitSequence
+   */
+  for (i=0; i<2*state->n_feistels; i++) {
+    u32 x = state->A[i];
+    bs[4*i  ] = x&0xff;
+    x >>= 8;
+    bs[4*i+1] = x&0xff;
+    x >>= 8;
+    bs[4*i+2] = x&0xff;
+    x >>= 8;
+    bs[4*i+3] = x&0xff;
+  }
+
+  memcpy(hashval, bs, state->hashbitlen/8);
+  if (state->hashbitlen%8) {
+    /*
+     * The memcpy above filled indices 0 .. hashbitlen/8 - 1, so the
+     * partial byte lives at index hashbitlen/8; keep its leading
+     * hashbitlen%8 bits only.
+     */
+    BitSequence mask = 0xff << (8 - (state->hashbitlen%8));
+    hashval[state->hashbitlen/8] = bs[state->hashbitlen/8] & mask;
+  }
+  return SUCCESS;
+}
+
+
+
+/*HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
+                BitSequence *hashval) {
+  hashState_sd s;
+  HashReturn r;
+  r = Init(&s, hashbitlen);
+  if (r != SUCCESS)
+    return r;
+  r = Update(&s, data, databitlen);
+  if (r != SUCCESS)
+    return r;
+  r = Final(&s, hashval);
+  return r;
+}
+*/
--- a/algo/simd/sse2/nist.h
+++ b/algo/simd/sse2/nist.h
@@ -0,0 +1,74 @@
+#ifndef __NIST_H__
+#define __NIST_H__
+
+/*define data alignment for different C compilers*/
+#if defined(__GNUC__)
+#define DATA_ALIGN(x) x __attribute__((aligned(16)))
+#else
+#define DATA_ALIGN(x) __declspec(align(16)) x
+#endif
+
+#include "compat.h"
+#include "algo/sha3/sha3-defs.h"
+/*
+ * NIST API Specific types.
+ */
+
+//typedef unsigned char BitSequence;
+
+//#ifdef HAS_64
+ // typedef u64 DataLength;
+//#else
+ // typedef unsigned long DataLength;
+//#endif
+
+// can't find u32 or fft-t
+#include <stdint.h>
+typedef uint32_t u32;
+typedef int fft_t;
+
+
+//typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
+
+typedef struct {             /* context used by init_sd, update_sd and final_sd */
+  unsigned int hashbitlen;   /* digest length in bits */
+  unsigned int blocksize;    /* message block length in bits */
+  unsigned int n_feistels;   /* word count of each of the A, B, C, D areas */
+
+#ifdef HAS_64
+  u64 count;                 /* message bits consumed so far */
+#else
+  u32 count_low;             /* message bits consumed so far, low word */
+  u32 count_high;            /* message bits consumed so far, high word */
+#endif
+
+  DATA_ALIGN(u32 A[32]);     /* state words, aligned to 16 bytes */
+  u32 *B;                    /* B, C, D point into A[] (set by InitIV), so a plain */
+  u32 *C;                    /* struct copy leaves them pointing at the source */
+  u32 *D;                    /* object's A[] */
+  DATA_ALIGN(unsigned char buffer[128]);   /* staging area for a partial block */
+  
+} hashState_sd;
+
+/* 
+ * NIST API
+ */
+
+HashReturn init_sd(hashState_sd *state, int hashbitlen);
+HashReturn update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen);
+HashReturn final_sd(hashState_sd *state, BitSequence *hashval);
+//HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
+//                BitSequence *hashval);
+
+/* 
+ * Internal API
+ */
+
+int SupportedLength(int hashbitlen);
+int RequiredAlignment(void);
+void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final);
+
+void fft128_natural(fft_t *a, unsigned char *x);
+void fft256_natural(fft_t *a, unsigned char *x);
+
+#endif
--- a/algo/simd/sse2/simd_iv.h
+++ b/algo/simd/sse2/simd_iv.h
@@ -0,0 +1,27 @@
+/*
+ * Precomputed initial values for the standard digest sizes.
+ * IV_224 and IV_256 hold 16 words, IV_384 and IV_512 hold 32 words.
+ *
+ * The tables are read-only and are defined in this header, so they are
+ * "static const": every translation unit that includes the header gets
+ * its own private copy and no duplicate external symbols are created.
+ */
+static const u32 IV_224[] = {
+  0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53,
+  0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96,
+  0x22e7b0af, 0xc862b3a8, 0x33e00cdc, 0x236b86a6,
+  0xf64ae77c, 0xfa373b76, 0x7dc1ee5b, 0x7fb29ce8
+};
+
+static const u32 IV_256[] = {
+  0x4d567983, 0x07190ba9, 0x8474577b, 0x39d726e9,
+  0xaaf3d925, 0x3ee20b03, 0xafd5e751, 0xc96006d3,
+  0xc2c2ba14, 0x49b3bcb4, 0xf67caf46, 0x668626c9,
+  0xe2eaa8d2, 0x1ff47833, 0xd0c661a5, 0x55693de1
+};
+
+static const u32 IV_384[] = {
+  0x8a36eebc, 0x94a3bd90, 0xd1537b83, 0xb25b070b, 0xf463f1b5, 0xb6f81e20, 0x0055c339, 0xb4d144d1,
+  0x7360ca61, 0x18361a03, 0x17dcb4b9, 0x3414c45a, 0xa699a9d2, 0xe39e9664, 0x468bfe77, 0x51d062f8,
+  0xb9e3bfe8, 0x63bece2a, 0x8fe506b9, 0xf8cc4ac2, 0x7ae11542, 0xb1aadda1, 0x64b06794, 0x28d2f462,
+  0xe64071ec, 0x1deb91a8, 0x8ac8db23, 0x3f782ab5, 0x039b5cb8, 0x71ddd962, 0xfade2cea, 0x1416df71
+};
+
+static const u32 IV_512[] = {
+  0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
+  0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
+  0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
+  0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
+};
--- a/algo/simd/sse2/sph_types.h
+++ b/algo/simd/sse2/sph_types.h
--- a/algo/simd/sse2/vector.c
+++ b/algo/simd/sse2/vector.c
@@ -0,0 +1,927 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "nist.h"
+#include "vector.h"
+
+#define PRINT_SOME 0
+
+/* JDD all ocurrances of macro X in this file renamed to XX
+ * due to name conflict
+ */
+
+/*
+ * Tell whether a digest size is usable: returns 1 for 1..512 bits
+ * (inclusive) and 0 for anything else.
+ */
+int SupportedLength(int hashbitlen) {
+  return (hashbitlen > 0 && hashbitlen <= 512) ? 1 : 0;
+}
+
+/*
+ * Alignment, in bytes, that input data must have to be hashed in place.
+ */
+int RequiredAlignment(void) {
+  enum { INPUT_ALIGNMENT = 16 };
+  return INPUT_ALIGNMENT;
+}
+
+static const union cv V128 = CV(128);   /* threshold compared against in EXTRA_REDUCE_S */
+static const union cv V255 = CV(255);   /* low-byte mask used by REDUCE */
+static const union cv V257 = CV(257);   /* the modulus, subtracted in EXTRA_REDUCE_S */
+static const union cv8  V0 = CV(0);     /* zero vector merged with message bytes; presumably CV(k) fills every lane with k - confirm in vector.h */
+
+
+/*
+ * Reduce modulo 257; result is in [-127; 383]
+ * REDUCE(x) := (x&255) - (x>>8)
+ */
+#define REDUCE(x)                               \
+  v16_sub(v16_and(x, V255.v16), v16_shift_r (x, 8))
+
+/*
+ * Reduce from [-127; 383] to [-128; 128]
+ * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
+ */
+#define EXTRA_REDUCE_S(x)                       \
+  v16_sub(x, v16_and(V257.v16, v16_cmp(x, V128.v16)))
+ 
+/*
+ * Reduce modulo 257; result is in [-128; 128]
+ */
+#define REDUCE_FULL_S(x)                        \
+  EXTRA_REDUCE_S(REDUCE(x))
+
+#define DO_REDUCE(i)                            \
+  X(i) = REDUCE(X(i))
+
+#define DO_REDUCE_FULL_S(i)                     \
+  do {                                          \
+    X(i) = REDUCE(X(i));                        \
+    X(i) = EXTRA_REDUCE_S(X(i));                \
+  } while(0)
+
+#define MAYBE_VOLATILE
+
+/*
+ * fft64: in-place transform of the eight v16 vectors stored at `a`.
+ *
+ * The vectors are loaded into eight local variables, run through eight
+ * parallel DIF FFT_8, multiplied by the FFT64_Twiddle factors, transposed
+ * with three rounds of interleaving, run through eight parallel DIT FFT_8
+ * and stored back to a[0..7]. All results are fully reduced modulo 257
+ * (range [-128; 128]) by DO_REDUCE_FULL_S before being stored.
+ *
+ * `a` must point to eight v16 values that may be read and written.
+ */
+MAYBE_INLINE void fft64(void *a) {
+
+  v16* const A = a;
+
+  register v16 X0, X1, X2, X3, X4, X5, X6, X7;
+
+  /*
+   * X(i) names the i-th working variable. This is the only definition of
+   * X in the function: an earlier variant that mapped X(i) onto A[]
+   * depending on V16_SIZE was immediately overridden by this one and made
+   * the macro incompatibly redefined, so it has been dropped.
+   */
+#define X(i) X##i
+
+  X0 = A[0];
+  X1 = A[1];
+  X2 = A[2];
+  X3 = A[3];
+  X4 = A[4];
+  X5 = A[5];
+  X6 = A[6];
+  X7 = A[7];
+
+#define DO_REDUCE(i)                            \
+  X(i) = REDUCE(X(i))
+
+  /*
+   * Begin with 8 parallels DIF FFT_8
+   *
+   * FFT_8 using w=4 as 8th root of unity
+   *  Unrolled decimation in frequency (DIF) radix-2 NTT.
+   *  Output data is in revbin_permuted order.
+   */
+
+  /* w[n] is the shift count applied for twiddle index n (multiply by 2^w[n]) */
+  static const int w[] = {0, 2, 4, 6};
+
+#define BUTTERFLY(i,j,n)                                \
+  do {                                                  \
+    MAYBE_VOLATILE v16 v = X(j);                              \
+    X(j) =  v16_add(X(i), X(j));                        \
+    if (n)                                              \
+      X(i) = v16_shift_l(v16_sub(X(i), v), w[n]);       \
+    else                                                \
+      X(i) = v16_sub(X(i), v);                          \
+  } while(0)
+
+  BUTTERFLY(0, 4, 0);
+  BUTTERFLY(1, 5, 1);
+  BUTTERFLY(2, 6, 2);
+  BUTTERFLY(3, 7, 3);
+  
+  DO_REDUCE(2);
+  DO_REDUCE(3);
+  
+  BUTTERFLY(0, 2, 0);
+  BUTTERFLY(4, 6, 0);
+  BUTTERFLY(1, 3, 2);
+  BUTTERFLY(5, 7, 2);
+  
+  DO_REDUCE(1);
+  
+  BUTTERFLY(0, 1, 0);
+  BUTTERFLY(2, 3, 0);
+  BUTTERFLY(4, 5, 0);
+  BUTTERFLY(6, 7, 0);
+  
+  /* We don't need to reduce X(7) */
+  DO_REDUCE_FULL_S(0);
+  DO_REDUCE_FULL_S(1);
+  DO_REDUCE_FULL_S(2);
+  DO_REDUCE_FULL_S(3);
+  DO_REDUCE_FULL_S(4);
+  DO_REDUCE_FULL_S(5);
+  DO_REDUCE_FULL_S(6);
+    
+#undef BUTTERFLY
+
+  /*
+   * Multiply by twiddle factors
+   */
+
+  X(6) = v16_mul(X(6), FFT64_Twiddle[0].v16);
+  X(5) = v16_mul(X(5), FFT64_Twiddle[1].v16);
+  X(4) = v16_mul(X(4), FFT64_Twiddle[2].v16);
+  X(3) = v16_mul(X(3), FFT64_Twiddle[3].v16);
+  X(2) = v16_mul(X(2), FFT64_Twiddle[4].v16);
+  X(1) = v16_mul(X(1), FFT64_Twiddle[5].v16);
+  X(0) = v16_mul(X(0), FFT64_Twiddle[6].v16);
+
+  /*
+   * Transpose the FFT state with a revbin order permutation
+   * on the rows and the column.
+   * This will make the full FFT_64 in order.
+   */
+
+#define INTERLEAVE(i,j)                          \
+  do {                                           \
+    v16 t1= X(i);                                \
+    v16 t2= X(j);                                \
+    X(i) = v16_interleavel(t1, t2);              \
+    X(j) = v16_interleaveh(t1, t2);              \
+  } while(0)
+
+  INTERLEAVE(1, 0);
+  INTERLEAVE(3, 2);
+  INTERLEAVE(5, 4);
+  INTERLEAVE(7, 6);
+
+  INTERLEAVE(2, 0);
+  INTERLEAVE(3, 1);
+  INTERLEAVE(6, 4);
+  INTERLEAVE(7, 5);
+
+  INTERLEAVE(4, 0);
+  INTERLEAVE(5, 1);
+  INTERLEAVE(6, 2);
+  INTERLEAVE(7, 3);
+
+#undef INTERLEAVE
+
+  /*
+   * Finish with 8 parallels DIT FFT_8
+   *
+   * FFT_8 using w=4 as 8th root of unity
+   *  Unrolled decimation in time (DIT) radix-2 NTT.
+   *  Input data is in revbin_permuted order.
+   */
+  
+#define BUTTERFLY(i,j,n)                                \
+  do {                                                  \
+    MAYBE_VOLATILE v16 u = X(j);                              \
+    if (n)                                              \
+      X(i) = v16_shift_l(X(i), w[n]);                   \
+    X(j) = v16_sub(X(j), X(i));                         \
+    X(i) = v16_add(u, X(i));                            \
+  } while(0)
+
+  DO_REDUCE(0);
+  DO_REDUCE(1);
+  DO_REDUCE(2);
+  DO_REDUCE(3);
+  DO_REDUCE(4);
+  DO_REDUCE(5);
+  DO_REDUCE(6);
+  DO_REDUCE(7);
+  
+  BUTTERFLY(0, 1, 0);
+  BUTTERFLY(2, 3, 0);
+  BUTTERFLY(4, 5, 0);
+  BUTTERFLY(6, 7, 0);
+  
+  BUTTERFLY(0, 2, 0);
+  BUTTERFLY(4, 6, 0);
+  BUTTERFLY(1, 3, 2);
+  BUTTERFLY(5, 7, 2);
+  
+  DO_REDUCE(3);
+  
+  BUTTERFLY(0, 4, 0);
+  BUTTERFLY(1, 5, 1);
+  BUTTERFLY(2, 6, 2);
+  BUTTERFLY(3, 7, 3);
+  
+  DO_REDUCE_FULL_S(0);
+  DO_REDUCE_FULL_S(1);
+  DO_REDUCE_FULL_S(2);
+  DO_REDUCE_FULL_S(3);
+  DO_REDUCE_FULL_S(4);
+  DO_REDUCE_FULL_S(5);
+  DO_REDUCE_FULL_S(6);
+  DO_REDUCE_FULL_S(7);
+  
+#undef BUTTERFLY
+
+  A[0] = X0;
+  A[1] = X1;
+  A[2] = X2;
+  A[3] = X3;
+  A[4] = X4;
+  A[5] = X5;
+  A[6] = X6;
+  A[7] = X7;
+
+#undef X
+
+}
+
+
+/*
+ * fft128: in-place transform of the sixteen v16 vectors stored at `a`.
+ *
+ * One stage of size-2 butterflies splits the input into a sum half (kept
+ * in the local array B) and a difference half (multiplied by
+ * FFT128_Twiddle and written back to a[8..15]); both halves go through
+ * fft64 and are finally interleaved into a[0..15].
+ */
+MAYBE_INLINE void fft128(void *a) {
+
+  v16 *A = (v16*) a;
+  v16 *upper = A + 8;
+
+  // Temp space to help for interleaving in the end
+  v16 B[8];
+
+  int k;
+
+  /* Size-2 butterflies */
+
+  for (k = 0; k < 8; k++) {
+    v16 lo = A[k];
+    v16 hi = upper[k];
+    v16 sum  = v16_add(lo, hi);
+    v16 diff = v16_sub(lo, hi);
+
+    B[k] = REDUCE_FULL_S(sum);
+
+    diff = REDUCE_FULL_S(diff);
+    diff = v16_mul(diff, FFT128_Twiddle[k].v16);
+    upper[k] = REDUCE_FULL_S(diff);
+  }
+
+  fft64(B);
+  fft64(upper);
+
+  /* Transpose (i.e. interleave) */
+
+  for (k = 0; k < 8; k++) {
+    v16 even = B[k];
+    v16 odd  = upper[k];
+
+    A[2*k]   = v16_interleavel (even, odd);
+    A[2*k+1] = v16_interleaveh (even, odd);
+  }
+}
+
+#ifdef v16_broadcast
+/* Compute the FFT using a table
+ * The function works if the value of the message is smaller 
+ * than 2^14.
+ *
+ * Only compiled when v16_broadcast is available.
+ * Only x[0] and x[1] are read. Each is mapped to a signed value (257 is
+ * subtracted from values above 128) and broadcast to a whole vector;
+ * then, for i = 0..15:
+ *   a[i] = REDUCE_FULL_S(Table[2*i] * msg2 + Table[2*i+1] + msg1)
+ * where a[] is viewed as sixteen v16 vectors and msg1/msg2 come from
+ * x[0]/x[1]. The active branch below is the unrolled form of the
+ * disabled loop; the local `i` is only used by that disabled loop.
+ */
+void fft128_msg_final(short *a, const unsigned char *x) {
+
+  static const union cv FFT128_Final_Table[] = {
+    {{   1, -211,   60,  -67,    2,   92, -137,  123}},
+    {{   2,  118,   45,  111,   97,  -46,   49, -106}},
+    {{   4,  -73,  -17,  -11,    8,  111,  -34,  -22}},
+    {{ -68,   -4,   76,  -25,   96,  -96,  -68,   -9}},
+    {{  16,  -35,  -68,  -44,   32,  -70, -136,  -88}},
+    {{   0, -124,   17,   12,   -6,   57,   47,   -8}},
+    {{  64,  117,  -15,   81,  128,  -23,  -30,  -95}},
+    {{ -68,  -53,  -52,  -70,  -10, -117,   77,   21}},
+    {{  -1,  -46,  -60,   67,   -2,  -92, -120, -123}},
+    {{  -2, -118,  -45, -111,  -97,   46,  -49,  106}},
+    {{  -4,   73,   17,   11,   -8, -111,   34,   22}},
+    {{  68,    4,  -76,   25,  -96,   96,   68,    9}},
+    {{ -16, -222,   68,   44,  -32,   70, -121,   88}},
+    {{   0,  124,  -17,  -12,    6,  -57,  -47,    8}},
+    {{ -64, -117,   15,  -81, -128, -234,   30,   95}},
+    {{  68,   53,   52,   70,   10,  117,  -77,  -21}},
+    {{-118,  -31,  116,  -61,   21,  -62,  -25, -122}},
+    {{-101,  107,  -45,  -95,   -8,    3,  101,  -34}},
+    {{  42, -124,  -50,   13,   84,    9, -100, -231}},
+    {{ -79,  -53,   82,   65,  -81,   47,   61,  107}},
+    {{ -89, -239,   57, -205, -178,   36, -143,  104}},
+    {{-126,  113,   33,  111,  103, -109,   65, -114}},
+    {{ -99,   72,  -29,  -49, -198, -113,  -58,  -98}},
+    {{   8,  -27, -106,  -30,  111,    6,   10, -108}},
+    {{-139,   31, -116, -196,  -21,   62,   25, -135}},
+    {{ 101, -107,   45,   95,    8,   -3, -101,   34}},
+    {{ -42, -133,   50,  -13,  -84,   -9,  100,  -26}},
+    {{  79,   53,  -82,  -65,   81,  -47,  -61, -107}},
+    {{-168,  -18,  -57,  -52,  -79,  -36, -114, -104}},
+    {{ 126, -113,  -33, -111, -103,  109,  -65,  114}},
+    {{  99,  -72, -228,   49,  -59,  113,   58, -159}},
+    {{  -8,   27,  106,   30, -111,   -6,  -10,  108}}
+  };
+
+  //  v16 *Table = (v16*)FFT128_Final_Table;
+  v16 *A = (v16*) a;
+  int i;
+
+  v16 msg1 = v16_broadcast(x[0]>128?x[0]-257:x[0]);
+  v16 msg2 = v16_broadcast(x[1]>128?x[1]-257:x[1]);
+  // v16 msg2 = v16_broadcast(x[1]);
+
+#if 0
+
+  for (i=0; i<16; i++) {
+    v16 tmp = v16_mul(FFT128_Final_Table[2*i].v16  , msg2);
+    v16 sum = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);
+    sum = v16_add(sum, tmp);
+    A[i] = REDUCE_FULL_S(sum);
+  }
+
+#else
+
+#define FFT_FINAL(i)                                           \
+  v16 tmp##i = v16_mul(FFT128_Final_Table[2*i].v16, msg2);     \
+  v16 sum##i = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);   \
+  sum##i = v16_add(sum##i, tmp##i);                            \
+  A[i] = REDUCE_FULL_S(sum##i);
+
+  FFT_FINAL(0)
+  FFT_FINAL(1)
+  FFT_FINAL(2)
+  FFT_FINAL(3)
+  FFT_FINAL(4)
+  FFT_FINAL(5)
+  FFT_FINAL(6)
+  FFT_FINAL(7)
+  FFT_FINAL(8)
+  FFT_FINAL(9)
+  FFT_FINAL(10)
+  FFT_FINAL(11)
+  FFT_FINAL(12)
+  FFT_FINAL(13)
+  FFT_FINAL(14)
+  FFT_FINAL(15)
+
+#endif
+
+}
+#endif
+
+void fft128_msg(short *a, const unsigned char *x, int final) {
+
+  static const union cv Tweak =
+    {{0,0,0,0,0,0,0,1}};
+  static const union cv FinalTweak =
+    {{0,0,0,0,0,1,0,1}};
+
+
+  v8  *X = (v8*)  x;
+  v16 *A = (v16*) a;
+  /*
+   * Overview: UNPACK(i) loads X[i], merges it with the zero vector V0
+   * into A[2*i] and A[2*i+1], and stores those values multiplied by the
+   * matching FFT128_Twiddle entry (then REDUCEd) in A[2*i+8] and
+   * A[2*i+9]. X[0..3] are read and A[0..15] are written. The last vector
+   * goes through UNPACK_TWEAK, which adds the tweak to A[7] and subtracts
+   * it before the twiddle multiplication that produces A[15]. `final`
+   * selects FinalTweak instead of Tweak. fft64 is then run on each half
+   * of a.
+   * The disabled scalar fft128_msg further down in this file shows the
+   * equivalent computation: the 64 message bytes followed by zeros, with
+   * a[127] = 1 and a[125] = final, transformed by fft128.
+   */
+
+#define UNPACK(i)                                      \
+  do {                                                 \
+    v8 t = X[i];                                       \
+    A[2*i]   = v8_mergel(t, V0.v8);                    \
+    A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16);          \
+    A[2*i+8] = REDUCE(A[2*i+8]);                       \
+    A[2*i+1] = v8_mergeh(t, V0.v8);                    \
+    A[2*i+9] = v16_mul(A[2*i+1], FFT128_Twiddle[2*i+1].v16);      \
+    A[2*i+9] = REDUCE(A[2*i+9]);                       \
+  } while(0)
+
+
+  /* 
+   * This allows to tweak the last butterflies to introduce X^127
+   * (lane 7 of the tweak vectors), plus the lane-5 term of FinalTweak
+   * for the final block.
+   */
+#define UNPACK_TWEAK(i,tw)                             \
+  do {                                                 \
+    v8 t = X[i];                                       \
+    v16 tmp;                                           \
+    A[2*i]   = v8_mergel(t, V0.v8);                    \
+    A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16);          \
+    A[2*i+8] = REDUCE(A[2*i+8]);                       \
+    tmp      = v8_mergeh(t, V0.v8);                    \
+    A[2*i+1] = v16_add(tmp, tw);                               \
+    A[2*i+9] = v16_mul(v16_sub(tmp, tw), FFT128_Twiddle[2*i+1].v16);      \
+    A[2*i+9] = REDUCE(A[2*i+9]);                       \
+  } while(0)
+
+  UNPACK(0);
+  UNPACK(1);
+  UNPACK(2);
+  if (final)
+    UNPACK_TWEAK(3, FinalTweak.v16);
+  else
+    UNPACK_TWEAK(3, Tweak.v16);
+
+#undef UNPACK
+#undef UNPACK_TWEAK
+
+  fft64(a);
+  fft64(a+64);
+}
+
+#if 0 /* disabled scalar version of fft128_msg, kept as documentation of what the vector code above computes */
+void fft128_msg(short *a, const unsigned char *x, int final) {
+
+  for (int i=0; i<64; i++)
+    a[i] = x[i];
+
+  for (int i=64; i<128; i++)
+    a[i] = 0;
+
+  a[127] = 1;
+  a[125] = final? 1: 0;
+
+  fft128(a);
+}
+#endif
+
+void fft256_msg(short *a, const unsigned char *x, int final) {
+
+  static const union cv Tweak =
+    {{0,0,0,0,0,0,0,1}};
+  static const union cv FinalTweak =
+    {{0,0,0,0,0,1,0,1}};
+
+
+  v8  *X = (v8*)  x;
+  v16 *A = (v16*) a;
+  /*
+   * Overview: same scheme as fft128_msg with twice the data. UNPACK(i)
+   * loads X[i], merges it with the zero vector V0 into A[2*i] and
+   * A[2*i+1], and stores those values multiplied by the matching
+   * FFT256_Twiddle entry (then REDUCEd) in A[2*i+16] and A[2*i+17].
+   * X[0..7] are read and A[0..31] are written. The last vector goes
+   * through UNPACK_TWEAK, which adds the tweak to A[15] and subtracts it
+   * before the twiddle multiplication that produces A[31]. `final`
+   * selects FinalTweak instead of Tweak. fft128 is then run on each
+   * half of a.
+   */
+
+#define UNPACK(i)                                       \
+  do {                                                  \
+    v8 t      = X[i];                                   \
+    A[2*i]    = v8_mergel(t, V0.v8);                    \
+    A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16);          \
+    A[2*i+16] = REDUCE(A[2*i+16]);                      \
+    A[2*i+1]  = v8_mergeh(t, V0.v8);                    \
+    A[2*i+17] = v16_mul(A[2*i+1], FFT256_Twiddle[2*i+1].v16);      \
+    A[2*i+17] = REDUCE(A[2*i+17]);                       \
+  } while(0)
+
+
+  /* 
+   * This allows to tweak the last butterflies to introduce X^127
+   * NOTE(review): the "X^127" wording matches fft128_msg; here the tweak
+   * lands in the last vector of a 256-point input, so the exponent is
+   * presumably 255 - confirm against the specification.
+   */
+#define UNPACK_TWEAK(i,tw)                              \
+  do {                                                  \
+    v8 t = X[i];                                        \
+    v16 tmp;                                            \
+    A[2*i]    = v8_mergel(t, V0.v8);                    \
+    A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16);          \
+    A[2*i+16] = REDUCE(A[2*i+16]);                       \
+    tmp       = v8_mergeh(t, V0.v8);                    \
+    A[2*i+1]  = v16_add(tmp, tw);                               \
+    A[2*i+17] = v16_mul(v16_sub(tmp, tw), FFT256_Twiddle[2*i+1].v16);      \
+    A[2*i+17] = REDUCE(A[2*i+17]);                      \
+  } while(0)
+
+  UNPACK(0);
+  UNPACK(1);
+  UNPACK(2);
+  UNPACK(3);
+  UNPACK(4);
+  UNPACK(5);
+  UNPACK(6);
+  if (final)
+    UNPACK_TWEAK(7, FinalTweak.v16);
+  else
+    UNPACK_TWEAK(7, Tweak.v16);
+
+#undef UNPACK
+#undef UNPACK_TWEAK
+
+  fft128(a);
+  fft128(a+128);
+}
+
+
+void rounds(u32* state, const unsigned char* msg, short* fft) {
+  
+  v32* S = (v32*) state;
+  const v32* M = (v32*)msg;
+  volatile v16* W = (v16*)fft;
+
+  register v32 S0, S1, S2, S3;
+  static const union cv code[] = { CV(185), CV(233) };
+
+  S0 = v32_xor(S[0], v32_bswap(M[0]));
+  S1 = v32_xor(S[1], v32_bswap(M[1]));
+  S2 = v32_xor(S[2], v32_bswap(M[2]));
+  S3 = v32_xor(S[3], v32_bswap(M[3]));
+
+#define S(i) S##i
+
+
+/* #define F_0(B, C, D)     ((((C) ^ (D)) & (B)) ^ (D)) */
+/* #define F_1(B, C, D)     (((D) & (C)) | (((D) | (C)) & (B))) */
+
+#define F_0(B, C, D)     v32_xor(v32_and(v32_xor(C,D), B), D)
+#define F_1(B, C, D)     v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
+
+#define F(a,b,c,fun) F_##fun (a,b,c)
+
+  /*
+   * We split the round function in two halfes
+   * so as to insert some independent computations in between
+   */
+
+#define SUM3_00 1
+#define SUM3_01 2
+#define SUM3_02 3
+#define SUM3_10 2
+#define SUM3_11 3
+#define SUM3_12 1
+#define SUM3_20 3
+#define SUM3_21 1
+#define SUM3_22 2
+
+#define STEP_1(a,b,c,d,w,fun,r,s,z)                             \
+  do {                                                          \
+    if (PRINT_SOME) {                                           \
+      int j;                                                    \
+      v32 ww=w, aa=a, bb=b, cc=c, dd=d;                         \
+      u32 *WW = (void*)&ww;                                     \
+      u32 *AA = (void*)&aa;                                     \
+      u32 *BB = (void*)&bb;                                     \
+      u32 *CC = (void*)&cc;                                     \
+      u32 *DD = (void*)&dd;                                     \
+      for (j=0; j<4; j++) {                                     \
+        printf ("%08x/%2i/%2i[%i]: %08x %08x %08x %08x\n",      \
+                WW[j], r, s, SUM3_##z,                          \
+                AA[j], BB[j], CC[j], DD[j]);                    \
+      }                                                         \
+    }                                                           \
+    TT = F(a,b,c,fun);                                          \
+    a = v32_rotate(a,r);                                        \
+    w = v32_add(w, d);                                          \
+    TT = v32_add(TT, w);                                        \
+    TT = v32_rotate(TT,s);                                      \
+    d = v32_shufxor(a,SUM3_##z);                                \
+  } while(0)
+
+#define STEP_2(a,b,c,d,w,fun,r,s)                               \
+  do {                                                          \
+    d = v32_add(d, TT);                                         \
+  } while(0)
+
+#define STEP(a,b,c,d,w,fun,r,s,z)               \
+  do {                                          \
+    register v32 TT;                            \
+    STEP_1(a,b,c,d,w,fun,r,s,z);                \
+    STEP_2(a,b,c,d,w,fun,r,s);                  \
+  } while(0);
+
+
+#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,        \
+              fun,r,s,t,u,z,r0)                           \
+  do {                                                    \
+    register v32 W0, W1, W2, W3, TT;                      \
+    W0 = v16_merge##u0(W[h0], W[l0]);                     \
+    W0 = V1632(v16_mul(V3216(W0), code[z].v16));          \
+    STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, r0##0); \
+    W1 = v16_merge##u1(W[h1], W[l1]);                     \
+    W1 = V1632(v16_mul(V3216(W1), code[z].v16));          \
+    STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s);        \
+    STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, r0##1); \
+    W2 = v16_merge##u2(W[h2], W[l2]);                     \
+    W2 = V1632(v16_mul(V3216(W2), code[z].v16));          \
+    STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t);        \
+    STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, r0##2); \
+    W3 = v16_merge##u3(W[h3], W[l3]);                     \
+    W3 = V1632(v16_mul(V3216(W3), code[z].v16));          \
+    STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u);        \
+    STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, r0##0); \
+    STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r);        \
+  } while(0)
+
+
+  /*
+   * 4 rounds with code 185
+   */
+  ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0, 3,  23, 17, 27, 0, 0);
+  ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1, 3,  23, 17, 27, 0, 1);
+  ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22,  7, 0, 2);
+  ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22,  7, 0, 0);
+
+  /*
+   * 4 rounds with code 233
+   */
+  ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1, 1);
+  ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1, 2);
+  ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1, 0);
+  ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1, 1);
+
+
+  /*
+   * 1 round as feed-forward
+   */
+  STEP(S(0), S(1), S(2), S(3), S[0], 0,  4, 13, 20);
+  STEP(S(3), S(0), S(1), S(2), S[1], 0, 13, 10, 21);
+  STEP(S(2), S(3), S(0), S(1), S[2], 0, 10, 25, 22);
+  STEP(S(1), S(2), S(3), S(0), S[3], 0, 25,  4, 20);
+
+  S[0] = S(0);  S[1] = S(1);  S[2] = S(2);  S[3] = S(3);
+}
+
+
+void rounds512(u32* state, const unsigned char* msg, short* fft) {
+  
+  v32* S = (v32*) state;
+  v32* M = (v32*) msg;
+  v16* W = (v16*) fft;
+
+  register v32 S0l, S1l, S2l, S3l;
+  register v32 S0h, S1h, S2h, S3h;
+  static const union cv code[] = { CV(185), CV(233) };
+
+  S0l = v32_xor(S[0], v32_bswap(M[0]));
+  S0h = v32_xor(S[1], v32_bswap(M[1]));
+  S1l = v32_xor(S[2], v32_bswap(M[2]));
+  S1h = v32_xor(S[3], v32_bswap(M[3]));
+  S2l = v32_xor(S[4], v32_bswap(M[4]));
+  S2h = v32_xor(S[5], v32_bswap(M[5]));
+  S3l = v32_xor(S[6], v32_bswap(M[6]));
+  S3h = v32_xor(S[7], v32_bswap(M[7]));
+
+#define S(i) S##i
+
+
+/* #define F_0(B, C, D)     ((((C) ^ (D)) & (B)) ^ (D)) */
+/* #define F_1(B, C, D)     (((D) & (C)) | (((D) | (C)) & (B))) */
+
+#define F_0(B, C, D)     v32_xor(v32_and(v32_xor(C,D), B), D)
+#define F_1(B, C, D)     v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
+
+#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
+#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
+
+  /*
+   * We split the round function in two halfes
+   * so as to insert some independent computations in between
+   */
+
+#define SUM7_00 0
+#define SUM7_01 1
+#define SUM7_02 2
+#define SUM7_03 3
+#define SUM7_04 4
+#define SUM7_05 5
+#define SUM7_06 6
+
+#define SUM7_10 1
+#define SUM7_11 2
+#define SUM7_12 3
+#define SUM7_13 4
+#define SUM7_14 5
+#define SUM7_15 6
+#define SUM7_16 0
+                
+#define SUM7_20 2
+#define SUM7_21 3
+#define SUM7_22 4
+#define SUM7_23 5
+#define SUM7_24 6
+#define SUM7_25 0
+#define SUM7_26 1
+                
+#define SUM7_30 3
+#define SUM7_31 4
+#define SUM7_32 5
+#define SUM7_33 6
+#define SUM7_34 0
+#define SUM7_35 1
+#define SUM7_36 2
+                
+#define SUM7_40 4
+#define SUM7_41 5
+#define SUM7_42 6
+#define SUM7_43 0
+#define SUM7_44 1
+#define SUM7_45 2
+#define SUM7_46 3
+                
+#define SUM7_50 5
+#define SUM7_51 6
+#define SUM7_52 0
+#define SUM7_53 1
+#define SUM7_54 2
+#define SUM7_55 3
+#define SUM7_56 4
+
+#define SUM7_60 6
+#define SUM7_61 0
+#define SUM7_62 1
+#define SUM7_63 2
+#define SUM7_64 3
+#define SUM7_65 4
+#define SUM7_66 5
+
+#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
+
+#define PERM_0(d,a) /* XOR 1 */           \
+  do {                                    \
+    d##l = v32_shufxor(a##l,1);           \
+    d##h = v32_shufxor(a##h,1);           \
+  } while(0)
+
+#define PERM_1(d,a) /* XOR 6 */           \
+  do {                                    \
+    d##l = v32_shufxor(a##h,2);           \
+    d##h = v32_shufxor(a##l,2);           \
+  } while(0)
+
+#define PERM_2(d,a) /* XOR 2 */           \
+  do {                                    \
+    d##l = v32_shufxor(a##l,2);           \
+    d##h = v32_shufxor(a##h,2);           \
+  } while(0)
+
+#define PERM_3(d,a) /* XOR 3 */           \
+  do {                                    \
+    d##l = v32_shufxor(a##l,3);           \
+    d##h = v32_shufxor(a##h,3);           \
+  } while(0)
+
+#define PERM_4(d,a) /* XOR 5 */           \
+  do {                                    \
+    d##l = v32_shufxor(a##h,1);           \
+    d##h = v32_shufxor(a##l,1);           \
+  } while(0)
+
+#define PERM_5(d,a) /* XOR 7 */           \
+  do {                                    \
+    d##l = v32_shufxor(a##h,3);           \
+    d##h = v32_shufxor(a##l,3);           \
+  } while(0)
+
+#define PERM_6(d,a) /* XOR 4 */           \
+  do {                                    \
+    d##l = a##h;                          \
+    d##h = a##l;                          \
+  } while(0)
+
+#define STEP_1_(a,b,c,d,w,fun,r,s,z)                            \
+  do {                                                          \
+    if (PRINT_SOME) {                                           \
+      int j;                                                    \
+      v32 ww=w##l, aa=a##l, bb=b##l, cc=c##l, dd=d##l;          \
+      u32 *WW = (void*)&ww;                                     \
+      u32 *AA = (void*)&aa;                                     \
+      u32 *BB = (void*)&bb;                                     \
+      u32 *CC = (void*)&cc;                                     \
+      u32 *DD = (void*)&dd;                                     \
+      for (j=0; j<4; j++) {                                     \
+        printf ("%08x/%2i/%2i: %08x %08x %08x %08x\n",          \
+                WW[j], r, s,                                    \
+                AA[j], BB[j], CC[j], DD[j]);                    \
+      }                                                         \
+    }                                                           \
+    TTl = Fl(a,b,c,fun);                                        \
+    TTh = Fh(a,b,c,fun);                                        \
+    a##l = v32_rotate(a##l,r);                                  \
+    a##h = v32_rotate(a##h,r);                                  \
+    w##l  = v32_add(w##l, d##l);                                \
+    w##h  = v32_add(w##h, d##h);                                \
+    TTl = v32_add(TTl, w##l);                                   \
+    TTh = v32_add(TTh, w##h);                                   \
+    TTl = v32_rotate(TTl,s);                                    \
+    TTh = v32_rotate(TTh,s);                                    \
+    PERM(z,d,a);                                                \
+  } while(0)
+
+#define STEP_1(a,b,c,d,w,fun,r,s,z)             \
+  STEP_1_(a,b,c,d,w,fun,r,s,z)
+
+#define STEP_2_(a,b,c,d,w,fun,r,s)                               \
+  do {                                                          \
+    d##l = v32_add(d##l, TTl);                                  \
+    d##h = v32_add(d##h, TTh);                                  \
+  } while(0)
+
+#define STEP_2(a,b,c,d,w,fun,r,s)              \
+  STEP_2_(a,b,c,d,w,fun,r,s)
+  
+#define STEP(a,b,c,d,w1,w2,fun,r,s,z)           \
+  do {                                          \
+    register v32 TTl, TTh, Wl=w1, Wh=w2;        \
+    STEP_1(a,b,c,d,W,fun,r,s,z);                \
+    STEP_2(a,b,c,d,W,fun,r,s);                  \
+  } while(0);
+
+
+#define MSG_l(x) (2*(x))
+#define MSG_h(x) (2*(x)+1)
+
+#define MSG(w,hh,ll,u,z)                                \
+  do {                                                  \
+    int a = MSG_##u(hh);                                \
+    int b = MSG_##u(ll);                                \
+    w##l = v16_mergel(W[a], W[b]);                      \
+    w##l = V1632(v16_mul(V3216(w##l), code[z].v16));    \
+    w##h = v16_mergeh(W[a], W[b]);                      \
+    w##h = V1632(v16_mul(V3216(w##h), code[z].v16));    \
+  } while(0)
+  
+#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,        \
+              fun,r,s,t,u,z)                              \
+  do {                                                    \
+    register v32 W0l, W1l, W2l, W3l, TTl;                 \
+    register v32 W0h, W1h, W2h, W3h, TTh;                 \
+    MSG(W0,h0,l0,u0,z);                                   \
+    STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, 0);     \
+    MSG(W1,h1,l1,u1,z);                                   \
+    STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s);        \
+    STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, 1);     \
+    MSG(W2,h2,l2,u2,z);                                   \
+    STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t);        \
+    STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, 2);     \
+    MSG(W3,h3,l3,u3,z);                                   \
+    STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u);        \
+    STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, 3);     \
+    STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r);        \
+  } while(0)
+
+
+  /*
+   * 4 rounds with code 185
+   */
+#define PERM_START 0
+  ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0, 3,  23, 17, 27, 0);
+#define PERM_START 4
+  ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1, 3,  23, 17, 27, 0);
+#define PERM_START 1
+  ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22, 7,  0);
+#define PERM_START 5
+  ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22, 7,  0);
+
+  /*
+   * 4 rounds with code 233
+   */
+#define PERM_START 2
+  ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1);
+#define PERM_START 6
+  ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1);
+#define PERM_START 3
+  ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1);
+#define PERM_START 0
+  ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1);
+
+
+  /*
+   * 1 round as feed-forward
+   */
+#define PERM_START 4
+  STEP(S(0), S(1), S(2), S(3), S[0], S[1], 0,  4, 13, 0);
+  STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
+  STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
+  STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25,  4, 3);
+
+  S[0] = S0l;  S[1] = S0h;  S[2] = S1l;  S[3] = S1h;
+  S[4] = S2l;  S[5] = S2h;  S[6] = S3l;  S[7] = S3h;
+}
+
+/*
+ * Compress one message block into the hash state.
+ *
+ * The message block m is first expanded into a local buffer of 16-bit
+ * values, then the matching round function updates state->A in place:
+ *   hashbitlen <= 256 : fft128_msg (or fft128_msg_final) + rounds
+ *   hashbitlen  > 256 : fft256_msg + rounds512
+ *
+ * "final" is handed through to the expansion routine.  When the vector
+ * layer provides v16_broadcast, final == 2 selects fft128_msg_final for
+ * the narrow variant.
+ */
+void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {
+  if (state->hashbitlen > 256) {
+    union cv expanded[32];
+    short* coeffs = (short*) expanded[0].u16;
+
+    fft256_msg(coeffs, m, final);
+    rounds512(state->A, m, coeffs);
+    return;
+  }
+
+  {
+    union cv expanded[16];
+    short* coeffs = (short*) expanded[0].u16;
+
+#ifdef v16_broadcast
+    if (final == 2) {
+      fft128_msg_final(coeffs, m);
+    } else {
+      fft128_msg(coeffs, m, final);
+    }
+#else
+    fft128_msg(coeffs, m, final);
+#endif
+
+    rounds(state->A, m, coeffs);
+  }
+}
+
+/*
+ * Give the FFT output in the regular order for consistency checks.
+ *
+ * Runs fft128_msg on a (with final = 0) and writes 128 values to x:
+ * x[2*i] is output i and x[2*i+1] is output i+64, for i = 0..63.
+ */
+void fft128_natural(fft_t *x, unsigned char *a) {
+  union cv expanded[16];
+  short* coeffs = (short*) expanded[0].u16;
+  const short* lower = coeffs;        /* outputs  0..63  */
+  const short* upper = coeffs + 64;   /* outputs 64..127 */
+  fft_t* out = x;
+  int left;
+
+  fft128_msg(coeffs, a, 0);
+
+  /* Interleave the two halves of the transform output. */
+  for (left = 64; left > 0; left--) {
+    *out++ = *lower++;
+    *out++ = *upper++;
+  }
+}
--- a/algo/simd/sse2/vector.h
+++ b/algo/simd/sse2/vector.h
@@ -0,0 +1,389 @@
+#ifndef __VECTOR_H__
+#define __VECTOR_H__
+
+#include "compat.h"
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+
+/******************************* 
+ * Using GCC vector extensions * 
+ *******************************/
+
+#if   defined(__SSE2__)
+
+/*
+ * 128-bit GCC vector types, named after the matching GCC machine modes
+ * (v16qi = 16 x 8-bit, v8hi = 8 x 16-bit, ...), plus two 64-bit ones.
+ */
+//typedef unsigned char v16qi __attribute__ ((vector_size (16)));
+typedef char          v16qi __attribute__ ((vector_size (16)));
+typedef short          v8hi __attribute__ ((vector_size (16)));
+typedef int            v4si __attribute__ ((vector_size (16)));
+typedef float          v4sf __attribute__ ((vector_size (16)));
+typedef long long int  v2di __attribute__ ((vector_size (16)));
+
+typedef short          v4hi __attribute__ ((vector_size (8)));
+typedef unsigned char  v8qi __attribute__ ((vector_size (8)));
+
+/* Short names used by the rest of the code: vNN = vector of NN-bit lanes. */
+typedef v16qi v8;
+typedef v8hi v16;
+typedef v4si v32;
+/* Number of 16-bit elements in one v16. */
+#define V16_SIZE 8
+
+/* A v16 that can also be initialised / read as eight 16-bit scalars. */
+union cv {
+  unsigned short u16[8];
+  v16 v16;
+};
+
+/* A v8 that can also be initialised / read as sixteen bytes. */
+union cv8 {
+  unsigned char u8[16];
+  v8 v8;
+};
+
+/*
+ * A v32 that can also be accessed as four u32 scalars.  The union tag
+ * "u32" does not clash with the u32 typedef because tags live in their own
+ * name space in C (this would not be accepted by a C++ compiler).
+ */
+union u32 {
+  u32 u[4];
+  v32 v;
+};
+
+/* Reinterpreting casts between the vector views (no value conversion). */
+#define V3216(x) ((v16) (x))
+#define V1632(x) ((v32) (x))
+#define  V168(x) ( (v8) (x))
+#define  V816(x) ((v16) (x))
+
+#if 0
+/* These instruction are shorter than the PAND/POR/... that GCC uses */
+
+#define vec_and(x,y)  ({v16 a = (v16) x; v16 b = (v16) y;  __builtin_ia32_andps ((v4sf) a, (v4sf) b);})
+#define vec_or(x,y)   ({v16 a = (v16) x; v16 b = (v16) y;  __builtin_ia32_orps ((v4sf) a, (v4sf) b);})
+#define vec_xor(x,y)  ({v16 a = (v16) x; v16 b = (v16) y;  __builtin_ia32_xorps ((v4sf) a, (v4sf) b);})
+#define vec_andn(x,y) ({v16 a = (v16) x; v16 b = (v16) y;  __builtin_ia32_andnps ((v4sf) a, (v4sf) b);})
+
+#define v16_and(x,y)  ((v16) vec_and ((x), (y)))
+#define v16_or(x,y)   ((v16) vec_or  ((x), (y)))
+#define v16_xor(x,y)  ((v16) vec_xor ((x), (y)))
+#define v16_andn(x,y) ((v16) vec_andn((x), (y)))
+
+#define v32_and(x,y)  ((v32) vec_and ((x), (y)))
+#define v32_or(x,y)   ((v32) vec_or  ((x), (y)))
+#define v32_xor(x,y)  ((v32) vec_xor ((x), (y)))
+#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
+#endif
+
+/* Bitwise operations: GCC vector types support the C operators directly. */
+#define vec_and(x,y) ((x)&(y))
+#define vec_or(x,y)  ((x)|(y))
+#define vec_xor(x,y) ((x)^(y))
+
+#define v16_and vec_and
+#define v16_or  vec_or
+#define v16_xor vec_xor
+
+#define v32_and vec_and
+#define v32_or  vec_or
+#define v32_xor vec_xor
+
+/*
+ * The arguments are parenthesised before the cast: a cast binds tighter
+ * than any binary operator, so "(v2di) x" with x = a ^ b would otherwise
+ * cast only a.
+ */
+#define vec_andn(x,y) __builtin_ia32_pandn128 ((v2di) (x), (v2di) (y))
+#define v16_andn(x,y) ((v16) vec_andn((x),(y)))
+#define v32_andn(x,y) ((v32) vec_andn((x),(y)))
+
+/* Lane-wise arithmetic, again through the vector-extension operators. */
+#define v32_add(x,y) ((x)+(y))
+
+#define v16_add(x,y) ((x)+(y))
+#define v16_sub(x,y) ((x)-(y))
+#define v16_mul(x,y) ((x)*(y))
+#define v16_neg(x)   (-(x))
+#define v16_shift_l  __builtin_ia32_psllwi128
+#define v16_shift_r  __builtin_ia32_psrawi128
+#define v16_cmp      __builtin_ia32_pcmpgtw128
+
+/* Interleave 16-bit lanes; the merge forms return the result as a v32. */
+#define v16_interleavel   __builtin_ia32_punpcklwd128
+#define v16_interleaveh   __builtin_ia32_punpckhwd128
+
+#define v16_mergel(a,b)   V1632(__builtin_ia32_punpcklwd128(a,b))
+#define v16_mergeh(a,b)   V1632(__builtin_ia32_punpckhwd128(a,b))
+
+#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
+#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
+
+#define v32_shift_l  __builtin_ia32_pslldi128
+#define v32_shift_r  __builtin_ia32_psrldi128
+
+/* Rotate every 32-bit lane left by n bits.  Note: x is evaluated twice. */
+#define v32_rotate(x,n)                                 \
+  v32_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))
+
+#define v32_shuf __builtin_ia32_pshufd
+
+/*
+ * pshufd selectors that move lane i to lane (i XOR s), for s = 1, 2, 3.
+ */
+#define SHUFXOR_1 0xb1          /* 0b10110001 */
+#define SHUFXOR_2 0x4e          /* 0b01001110 */
+#define SHUFXOR_3 0x1b          /* 0b00011011 */
+
+/* XCAT expands its arguments before pasting them; CAT pastes them as-is. */
+#define CAT(x, y) x##y
+#define XCAT(x,y) CAT(x,y)
+
+/* s may itself be a macro expanding to 1, 2 or 3, hence XCAT. */
+#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
+
+/* No byte swap is applied in this configuration. */
+#define v32_bswap(x) (x)
+
+/*
+ * Build a v16 holding x | (x << 16) in every 32-bit lane, i.e. x in all
+ * eight 16-bit lanes when x fits in 16 bits.  The temporaries carry a
+ * suffix so they cannot capture an identifier used in the argument, and
+ * the union is fully initialised so no indeterminate lane is ever read.
+ */
+#define v16_broadcast(x) ({                                     \
+      union u32 bcast_u_ = {{0, 0, 0, 0}};                      \
+      u32 bcast_x_ = (x);                                       \
+      bcast_u_.u[0] = bcast_x_ | (bcast_x_ << 16);              \
+      V3216(v32_shuf(bcast_u_.v,0)); })
+
+/* Initialiser for a union cv with all eight 16-bit elements equal to x. */
+#define CV(x) {{x, x, x, x, x, x, x, x}}
+
+#elif defined(__ALTIVEC__)
+
+#include <altivec.h>
+
+/* AltiVec vector types; vNN = vector of NN-bit lanes. */
+typedef vector unsigned char  v8;
+typedef vector signed   short v16;
+typedef vector unsigned int   v32;
+
+/* Reinterpreting casts between the vector views. */
+#define V3216(x) ((v16) (x))
+#define V1632(x) ((v32) (x))
+#define  V168(x) ( (v8) (x))
+#define  V816(x) ((v16) (x))
+
+/* Number of 16-bit elements in one v16. */
+#define V16_SIZE 8
+#define print_vec print_sse
+
+#define MAKE_VECT(x, ...) {{x, __VA_ARGS__}}
+
+/*
+ * CV(x) is a brace initialiser for a union cv with all elements equal to
+ * x; CV16/CVU16/CV32 are vector literals with every lane set to x.
+ */
+#define CV(x) MAKE_VECT(x, x, x, x, x, x, x, x)
+#define CV16(x)  ((vector   signed short) {x,x,x,x,x,x,x,x})
+#define CVU16(x) ((vector unsigned short) {x,x,x,x,x,x,x,x})
+#define CV32(x)  ((vector unsigned int  ) {x,x,x,x})
+
+/* A v16 that can also be initialised / read as eight 16-bit scalars. */
+union cv {
+  unsigned short u16[8];
+  v16 v16;
+};
+
+/* A v8 that can also be initialised / read as sixteen bytes. */
+union cv8 {
+  unsigned char u8[16];
+  v8 v8;
+};
+
+/* Eight 16-bit scalars overlaid on a byte vector (member is named v16). */
+union ucv {
+  unsigned short u16[8];
+  vector unsigned char v16;
+};
+
+// Nasty hack to avoid macro expansion madness:
+// the vec_and/vec_or/vec_xor names from <altivec.h> are first wrapped in
+// functions, then #undef'ed and redefined below as macros that cast to and
+// from v16, so the same name works for every vector type.
+
+
+/* altivec.h is broken with GCC 3.3 in C99 mode */
+#if defined __STDC__ && __STDC_VERSION__ >= 199901L
+#define typeof __typeof
+#endif
+
+MAYBE_INLINE v16 vec_and_fun (v16 x, v16 y) {
+  return vec_and (x, y);
+}
+
+MAYBE_INLINE v16 vec_or_fun (v16 x, v16 y) {
+  return vec_or (x, y);
+}
+
+MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) {
+  return vec_xor (x, y);
+}
+
+#undef vec_and
+#undef vec_or
+#undef vec_xor
+
+/* The result is cast back to the type of the first argument. */
+#define vec_and(x,y) ((__typeof(x)) vec_and_fun((v16) x, (v16) y))
+#define vec_or(x,y)  ((__typeof(x)) vec_or_fun((v16) x, (v16) y))
+#define vec_xor(x,y) ((__typeof(x)) vec_xor_fun((v16) x, (v16) y))
+
+
+#define v16_and vec_and
+#define v16_or  vec_or
+#define v16_xor vec_xor
+
+#define v32_and vec_and
+#define v32_or  vec_or
+#define v32_xor vec_xor
+
+
+#define v32_add vec_add
+
+#define v16_add vec_add
+#define v16_sub vec_sub
+/* Lane-wise 16-bit multiply, expressed as multiply-add with a zero addend. */
+#define v16_mul(a,b) vec_mladd(a,b,CV16(0))
+
+/*
+ * NOTE(review): ZZ is a non-static object defined in a header and is not
+ * referenced anywhere in this part of the header; if more than one
+ * translation unit includes this file on AltiVec it will be defined more
+ * than once -- confirm whether it is still needed.
+ */
+vector unsigned   short ZZ = {0,0,0,0,0,0,0,0};
+
+/*
+ * NOTE(review): this function has external linkage and is defined in a
+ * header; the macro of the same name defined right after it takes over
+ * every later call written as v16_shift_l(x,s) -- confirm the function is
+ * still wanted.
+ */
+v16 v16_shift_l(v16 x,int s) {
+  vector unsigned short shift = {s,s,s,s,s,s,s,s};
+  v16 y = vec_sl (x, shift);
+  return y;
+}
+/* Lane-wise 16-bit shifts by s, using a vector with s in every lane. */
+#define v16_shift_l(x,s)  vec_sl (x,CVU16(s))
+#define v16_shift_r(x,s)  vec_sra(x,CVU16(s))
+#define v16_cmp      vec_cmpgt
+
+/*
+ * Merge/interleave helpers.  The "l" forms map to vec_mergeh and the "h"
+ * forms to vec_mergel; the merge forms also swap their operands --
+ * presumably to match the lane order of the SSE2 version, TODO confirm.
+ */
+#define v16_mergel(a,b)   V1632(vec_mergeh(b,a))
+#define v16_mergeh(a,b)   V1632(vec_mergel(b,a))
+
+#define v16_interleavel(a,b)   vec_mergeh(a,b)
+#define v16_interleaveh(a,b)   vec_mergel(a,b)
+
+#define v8_mergel(a,b) V816(vec_mergeh(b,a))
+#define v8_mergeh(a,b) V816(vec_mergel(b,a))
+
+/* Rotate every 32-bit lane left by s bits. */
+#define v32_rotate(x,s)  vec_rl(x,CV32(s))
+
+// #define v32_unpckl   vec_mergel
+// #define v32_unpckh   vec_mergeh
+
+/* Byte permutation of a single vector, with s as the selector vector. */
+#define vector_shuffle(x,s) vec_perm(x,x,s)
+
+/*
+ * Byte selectors that move 32-bit lane i to lane (i XOR s), s = 1, 2, 3.
+ */
+static const v8 SHUFXOR_1 = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
+static const v8 SHUFXOR_2 = {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7};
+static const v8 SHUFXOR_3 = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
+
+/* Plain ## pasting here: s has to be written literally as 1, 2 or 3. */
+#define v32_shufxor(x,s) vector_shuffle(x,SHUFXOR_##s)
+
+/* Selector that reverses the four bytes inside every 32-bit lane. */
+//static const v8 SHUFSWAP = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
+static const v8 SHUFSWAP = {3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12};
+
+#define v32_bswap(x) vector_shuffle(x,SHUFSWAP)
+
+#else
+
+#error "I don't know how to vectorize on this architecture."
+
+#endif
+
+#else
+
+/*************************************
+ * Using MSVC/ICC vector intrinsics  *
+ *************************************/
+
+#include <emmintrin.h>
+
+/*
+ * With the intrinsics API every integer vector is a plain __m128i, so the
+ * three lane views share one type and the view casts are no-ops.
+ */
+typedef __m128i  v8;
+typedef __m128i v16;
+typedef __m128i v32;
+
+#define V3216(x) (x)
+#define V1632(x) (x)
+#define  V168(x) (x)
+#define  V816(x) (x)
+
+/* Number of 16-bit elements in one v16. */
+#define V16_SIZE 8
+
+/* A v16 that can also be initialised / read as eight 16-bit scalars. */
+union cv {
+  unsigned short u16[8];
+  v16 v16;
+};
+
+/* A v8 that can also be initialised / read as sixteen bytes. */
+union cv8 {
+  unsigned char u8[16];
+  v8 v8;
+};
+
+/* Initialiser for a union cv with all eight 16-bit elements equal to x. */
+#define CV(x) {{x, x, x, x, x, x, x, x}}
+
+#define vec_and      _mm_and_si128
+#define vec_or       _mm_or_si128
+#define vec_xor      _mm_xor_si128
+
+#define v16_and vec_and
+#define v16_or  vec_or
+#define v16_xor vec_xor
+
+#define v32_and vec_and
+#define v32_or  vec_or
+#define v32_xor vec_xor
+
+/*
+ * NOTE: _mm_shuffle_epi8 is an SSSE3 intrinsic declared in <tmmintrin.h>,
+ * which this branch does not include.  Nothing else in this branch expands
+ * vector_shuffle; a user of it has to provide SSSE3 support itself.
+ */
+#define vector_shuffle(x,s) _mm_shuffle_epi8(x, s)
+
+#define v32_add      _mm_add_epi32
+
+#define v16_add      _mm_add_epi16
+#define v16_sub      _mm_sub_epi16
+#define v16_mul      _mm_mullo_epi16
+/*
+ * Unary minus is not a 16-bit lane negation on __m128i (it is either
+ * rejected by the compiler or applied to a different lane width), so the
+ * negation is written as 0 - x on 16-bit lanes.
+ */
+#define v16_neg(x)   _mm_sub_epi16(_mm_setzero_si128(), (x))
+#define v16_shift_l  _mm_slli_epi16
+#define v16_shift_r  _mm_srai_epi16
+#define v16_cmp      _mm_cmpgt_epi16
+
+#define v16_interleavel   _mm_unpacklo_epi16
+#define v16_interleaveh   _mm_unpackhi_epi16
+
+#define v16_mergel   _mm_unpacklo_epi16
+#define v16_mergeh   _mm_unpackhi_epi16
+
+#define v8_mergel    _mm_unpacklo_epi8
+#define v8_mergeh    _mm_unpackhi_epi8
+
+#define v32_shift_l  _mm_slli_epi32
+#define v32_shift_r  _mm_srli_epi32
+
+/* Rotate every 32-bit lane left by n bits.  Note: x is evaluated twice. */
+#define v32_rotate(x,n)                                 \
+  vec_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))
+
+#define v32_shuf     _mm_shuffle_epi32
+
+/*
+ * Shuffle selectors that move 32-bit lane i to lane (i XOR s), s = 1, 2, 3.
+ */
+#define SHUFXOR_1 0xb1          /* 0b10110001 */
+#define SHUFXOR_2 0x4e          /* 0b01001110 */
+#define SHUFXOR_3 0x1b          /* 0b00011011 */
+
+/* XCAT expands its arguments before pasting them; CAT pastes them as-is. */
+#define CAT(x, y) x##y
+#define XCAT(x,y) CAT(x,y)
+
+/*
+ * s may itself be a macro that expands to 1, 2 or 3, so it has to be
+ * expanded before it is pasted onto SHUFXOR_ (a plain SHUFXOR_##s would
+ * paste the unexpanded name).
+ */
+#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
+
+/* No byte swap is applied in this configuration. */
+#define v32_bswap(x) (x)
+
+#endif
+
+/* Twiddle tables */
+
+  /*
+   * Twiddle factors for the 64-point transform: 7 vectors of 8 values.
+   * Row 0 holds the powers of two 1..128.  The entries look like residues
+   * modulo 257 written in the range [-128, 128] (e.g. 60*4 = 240 appears
+   * as -17) -- TODO confirm against the FFT code that consumes them.
+   * The braces initialise the unsigned short member of union cv, so the
+   * negative entries are stored modulo 2^16.
+   */
+  static const union cv FFT64_Twiddle[] = {
+    {{1,    2,    4,    8,   16,   32,   64,  128}},
+    {{1,   60,    2,  120,    4,  -17,    8,  -34}},
+    {{1,  120,    8,  -68,   64,  -30,   -2,   17}},
+    {{1,   46,   60,  -67,    2,   92,  120,  123}},
+    {{1,   92,  -17,  -22,   32,  117,  -30,   67}},
+    {{1,  -67,  120,  -73,    8,  -22,  -68,  -70}},
+    {{1,  123,  -34,  -70,  128,   67,   17,   35}},
+  };
+
+
+  /*
+   * Twiddle factors for the 128-point transform: 8 vectors of 8 values.
+   * The first column holds the powers of two 1..128; each row looks like
+   * the previous one doubled and reduced modulo 257 into [-128, 128] --
+   * TODO confirm against the FFT code that consumes them.
+   * The braces initialise the unsigned short member of union cv, so the
+   * negative entries are stored modulo 2^16.
+   */
+  static const union cv FFT128_Twiddle[] =  {
+    {{  1, -118,   46,  -31,   60,  116,  -67,  -61}},
+    {{  2,   21,   92,  -62,  120,  -25,  123, -122}},
+    {{  4,   42,  -73, -124,  -17,  -50,  -11,   13}},
+    {{  8,   84,  111,    9,  -34, -100,  -22,   26}},
+    {{ 16,  -89,  -35,   18,  -68,   57,  -44,   52}},
+    {{ 32,   79,  -70,   36,  121,  114,  -88,  104}},
+    {{ 64,  -99,  117,   72,  -15,  -29,   81,  -49}},
+    {{128,   59,  -23, -113,  -30,  -58,  -95,  -98}},
+  };
+
+
+  /*
+   * Twiddle factors for the 256-point transform: 16 vectors of 8 values.
+   * The entries look like residues modulo 257 written in the range
+   * [-128, 128] -- TODO confirm against the FFT code that consumes them.
+   * The braces initialise the unsigned short member of union cv, so the
+   * negative entries are stored modulo 2^16.
+   */
+  static const union cv FFT256_Twiddle[] =  {
+    {{   1,   41, -118,   45,   46,   87,  -31,   14}}, 
+    {{  60, -110,  116, -127,  -67,   80,  -61,   69}}, 
+    {{   2,   82,   21,   90,   92,  -83,  -62,   28}}, 
+    {{ 120,   37,  -25,    3,  123,  -97, -122, -119}}, 
+    {{   4,  -93,   42,  -77,  -73,   91, -124,   56}}, 
+    {{ -17,   74,  -50,    6,  -11,   63,   13,   19}}, 
+    {{   8,   71,   84,  103,  111,  -75,    9,  112}}, 
+    {{ -34, -109, -100,   12,  -22,  126,   26,   38}}, 
+    {{  16, -115,  -89,  -51,  -35,  107,   18,  -33}}, 
+    {{ -68,   39,   57,   24,  -44,   -5,   52,   76}}, 
+    {{  32,   27,   79, -102,  -70,  -43,   36,  -66}}, 
+    {{ 121,   78,  114,   48,  -88,  -10,  104, -105}}, 
+    {{  64,   54,  -99,   53,  117,  -86,   72,  125}}, 
+    {{ -15, -101,  -29,   96,   81,  -20,  -49,   47}}, 
+    {{ 128,  108,   59,  106,  -23,   85, -113,   -7}}, 
+    {{ -30,   55,  -58,  -65,  -95,  -40,  -98,   94}}
+  };
+
+
+
+
+#endif