Compare commits

...

5 Commits

Author      SHA1        Message   Date
Jay D Dee   88f81fda0b  v3.11.7   2020-01-26 04:33:39 -05:00
Jay D Dee   103e6ad36c  v3.11.6   2020-01-23 00:11:08 -05:00
Jay D Dee   1a7a573675  v3.11.5   2020-01-18 15:14:27 -05:00
Jay D Dee   70089d1224  v3.11.2   2020-01-08 14:44:47 -05:00
Jay D Dee   3572cb53c4  v3.11.0   2020-01-02 23:54:08 -05:00
274 changed files with 11965 additions and 19093 deletions

View File

@@ -33,3 +33,6 @@ Jay D Dee
xcouiz@gmail.com
Cryply
Colin Percival
Alexander Peslyak

View File

@@ -80,15 +80,18 @@ cpuminer_SOURCES = \
algo/cryptonight/cryptonight-common.c\
algo/cryptonight/cryptonight-aesni.c\
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/echo/sph_echo.c \
algo/echo/echo-hash-4way.c \
algo/echo/aes_ni/hash.c\
algo/gost/sph_gost.c \
algo/groestl/groestl-gate.c \
algo/groestl/groestl512-hash-4way.c \
algo/groestl/groestl256-hash-4way.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
algo/groestl/groestl-4way.c \
algo/groestl/myrgr-gate.c \
algo/groestl/myrgr-4way.c \
algo/groestl/myr-groestl.c \
@@ -117,7 +120,8 @@ cpuminer_SOURCES = \
algo/keccak/keccak-hash-4way.c \
algo/keccak/keccak-4way.c\
algo/keccak/keccak-gate.c \
algo/keccak/sse2/keccak.c \
algo/keccak/sha3d-4way.c \
algo/keccak/sha3d.c \
algo/lanehash/lane.c \
algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \
@@ -147,6 +151,7 @@ cpuminer_SOURCES = \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/panama/panama-hash-4way.c \
algo/panama/sph_panama.c \
algo/radiogatun/sph_radiogatun.c \
algo/quark/quark-gate.c \
@@ -172,11 +177,11 @@ cpuminer_SOURCES = \
algo/scrypt/scrypt.c \
algo/scrypt/neoscrypt.c \
algo/scrypt/pluck.c \
algo/scryptjane/scrypt-jane.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha256-hash-4way.c \
algo/sha/sha512-hash-4way.c \
algo/sha/hmac-sha256-hash.c \
algo/sha/sha2.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
@@ -188,6 +193,7 @@ cpuminer_SOURCES = \
algo/shavite/sph_shavite.c \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/simd/nist.c \
@@ -288,12 +294,11 @@ cpuminer_SOURCES = \
algo/x22/x25x.c \
algo/x22/x25x-4way.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/sha256_p.c \
algo/yespower/yescrypt-r8g.c \
algo/yespower/yespower-opt.c
disable_flags =

View File

@@ -97,10 +97,10 @@ Supported Algorithms
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptjane:nf
sha256d Double SHA-256
sha256q Quad SHA-256, Pyrite (PYE)
sha256t Triple SHA-256, Onecoin (OC)
sha3d Double keccak256 (BSHA3)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
@@ -134,6 +134,7 @@ Supported Algorithms
xevan Bitsend (BSD)
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr8g Koto (KOTO)
yescryptr16 Eli
yescryptr32 WAVI
yespower Cryply

View File

@@ -8,9 +8,10 @@ Security warning
Miner programs are often flagged as malware by antivirus programs. This is
usually a false positive; they are flagged simply because they are
cryptocurrency miners. However, some malware has been spread using the
cover that miners are known to be subject to false positives. Always be on
alert. The source code of cpuminer-opt is open for anyone to inspect.
cryptocurrency miners. However, some malware masquerading as a miner has
been spread using the cover that miners are known to be subject to false
positives and users will dismiss the AV alert. Always be on alert.
The source code of cpuminer-opt is open for anyone to inspect.
If you don't trust the software don't download it.
The cryptographic hashing code has been taken from trusted sources but has been
@@ -21,7 +22,7 @@ required.
Compile Instructions
--------------------
See INSTALL_LINUX or INSTALL_WINDOWS fror compile instruuctions
See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
@@ -29,19 +30,141 @@ Requirements
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple, Android and Rpi are
not supported. FreeBSD YMMV.
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
Reporting bugs
--------------
Bugs can be reported by sending an email to JayDDee246@gmail.com or opening
an issue in git: https://github.com/JayDDee/cpuminer-opt/issues
Please include the following information:
1. CPU model, operating system, cpuminer-opt version (must be latest),
binary file for Windows, changes to default build procedure for Linux.
2. Exact command line (except user and pw) and initial output showing
the above requested info.
3. Additional program output showing any error messages or other
pertinent data.
4. A clear description of the problem including history, scope,
persistence or intermittence, and reproducibility.
In simpler terms:
What is it doing?
What should it be doing instead?
Did it work in a previous release?
Does it happen for all algos? All pools? All options? Solo?
Does it happen all the time?
If not what makes it happen or not happen?
Change Log
----------
v3.11.7
Added yescryptr8g algo for KOTO, including support for block version 5.
Added sha3d algo for BSHA3.
Removed memcmp and clean_job checks from get_new_work; it now only checks job_id.
Small improvement to sha512 and sha256 parallel implementations that don't
use SHA.
v3.11.6
Fixed CPU temperature regression from v3.11.5.
More improvements to the share log: more compact, highlights the incremented
counter, block height when solved, and job id when stale.
v3.11.5
Fixed AVX512 detection that could cause compilation errors on CPUs
without AVX512.
Fixed "BLOCK SOLVED" log incorrectly displaying "Accepted" when a block
is solved.
Added share counter to share submitted & accepted logs.
Added job id to share submitted log.
Share submitted log is no longer highlighted blue; there was too much blue.
Another CPU temperature fix for Linux.
Added bug reporting tips to RELEASE NOTES.
v3.11.4
Fixed scrypt segfault since v3.9.9.1.
Stale shares counted and reported separately from other rejected shares.
Display of counters for solved blocks, rejects, stale shares suppressed in
periodic summary when zero.
v3.11.3
Fixed x12 AVX2 again.
More speed for allium: AVX2 +4%, AVX512 +6%, VAES +14%.
Restored lost speed for x22i & x25x.
v3.11.2
Fixed x11gost (sib) AVX2 invalid shares.
Fixed x16r, x16rv2, x16s, x16rt, x16rt-veil (veil), x21s.
No shares were submitted when cube, shavite or echo were the first function
in the hash order.
Fixed all algos reporting stats problems when mining with SSE2.
Faster Lyra2 AVX512: lyra2z +47%, lyra2rev3 +11%, allium +13%, x21s +6%.
Other minor performance improvements.
Known issue:
Lyra2 AVX512 improvements paradoxically reduced performance on x22i and x25x.
https://github.com/JayDDee/cpuminer-opt/issues/225
v3.11.1
Faster panama for x25x AVX2 & AVX512.
Fixed echo VAES for Xevan.
Removed support for scryptjane algo.
Reverted macro implementations of hash functions to SPH reference code
for SSE2 versions of algos.
v3.11.0
Fixed x25x AVX512 lane 4 invalid shares.
AVX512 for hex, phi2.
VAES optimization for Intel Icelake CPUs for most algos recently optimized
with AVX512, source code only.
v3.10.7
AVX512 for x25x, lbry, x13bcd (bcd).
v3.10.6
Added support for SSL stratum: stratum+tcps://
Added job id reporting again, but leaner, suppressed with --quiet.
AVX512 for x21s, x22i, lyra2z, allium
AVX512 for x21s, x22i, lyra2z, allium.
Fixed share overflow warnings mining lbry with Ryzen (SHA).

View File

@@ -206,10 +206,10 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_QUARK: register_quark_algo ( gate ); break;
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: register_sha3d_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
@@ -248,6 +248,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
*/
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
@@ -317,6 +318,7 @@ const char* const algo_alias_map[][2] =
{ "argon2d-crds", "argon2d250" },
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },

View File

@@ -121,54 +121,55 @@ void ( *hash_suw ) ( void*, const void* );
// Allocate thread local buffers and other initialization specific to miner
// threads.
bool ( *miner_thread_init ) ( int );
bool ( *miner_thread_init ) ( int );
// Generate global blockheader from stratum data.
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
// Get thread local copy of blockheader with unique nonce.
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*,
bool );
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t* );
// Return pointer to nonce in blockheader.
uint32_t *( *get_nonceptr ) ( uint32_t* );
uint32_t *( *get_nonceptr ) ( uint32_t* );
// Decode getwork blockheader
bool ( *work_decode ) ( const json_t*, struct work* );
bool ( *work_decode ) ( const json_t*, struct work* );
// Extra getwork data
void ( *decode_extra_data ) ( struct work*, uint64_t* );
void ( *decode_extra_data ) ( struct work*, uint64_t* );
bool ( *submit_getwork_result ) ( CURL*, struct work* );
bool ( *submit_getwork_result ) ( CURL*, struct work* );
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
// Increment extranonce
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t,
unsigned char* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t );
// Build mining.submit message
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
char* ( *malloc_txs_request ) ( struct work* );
char* ( *malloc_txs_request ) ( struct work* );
// Big or little
void ( *set_work_data_endian ) ( struct work* );
void ( *set_work_data_endian ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
// Wait for first work
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( struct work* );
void ( *resync_threads ) ( struct work* );
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
bool ( *stratum_handle_response )( json_t* );
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
bool ( *stratum_handle_response ) ( json_t* );
set_t optimizations;
int ( *get_work_data_size ) ();
int ntime_index;
@@ -225,7 +226,7 @@ uint32_t *std_get_nonceptr( uint32_t *work_data );
uint32_t *jr2_get_nonceptr( uint32_t *work_data );
void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
uint32_t* end_nonce_ptr, bool clean_job );
uint32_t* end_nonce_ptr );
void jr2_get_new_work( struct work *work, struct work *g_work, int thr_id,
uint32_t* end_nonce_ptr );
@@ -256,7 +257,8 @@ double std_calc_network_diff( struct work *work );
void std_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits );
uint32_t ntime, uint32_t nbits,
unsigned char *final_sapling_hash );
void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
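
The signature change above threads a new final_sapling_hash argument through
build_block_header; per the change log it exists for KOTO's version 5 blocks,
whose headers carry a Zcash-style final sapling root hash. A hedged sketch of
a call under that assumption, with placeholder variables:

// Placeholder variables; only the added trailing argument is the point.
uint32_t version = 5;                      // KOTO block version 5
unsigned char final_sapling_hash[32];      // sapling root from the pool/daemon
std_build_block_header( &g_work, version, prevhash, merkle_root,
                        ntime, nbits, final_sapling_hash );
// Algos without a sapling field would presumably pass NULL here.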

View File

@@ -62,9 +62,7 @@ int scanhash_argon2( struct work* work, uint32_t max_nonce,
argon2hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio(work, hash);
return 1;
submit_solution( work, hash, mythr );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
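
This hunk shows the scanhash convention adopted throughout these commits:
instead of returning 1 on the first valid hash and ending the scan, the loop
hands each solution to submit_solution() and keeps searching until max_nonce
or a work restart, so one scan can yield several shares. The resulting shape,
condensed:

// Condensed shape of the new pattern (not the exact argon2 code).
do {
   argon2hash( hash, endiandata );                 // hash candidate header
   if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
   {
      pdata[19] = nonce;                           // record winning nonce
      submit_solution( work, hash, mythr );        // submit, don't return
   }
   nonce++;
} while ( nonce < max_nonce && !work_restart[thr_id].restart );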

View File

@@ -13,7 +13,7 @@ void blakehash_4way(void *state, const void *input)
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way( &ctx, input + (64<<2), 16 );
blake256r14_4way_update( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -36,7 +36,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way( &blake_4w_ctx, vdata, 64 );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
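
Beyond the _update rename, these hunks illustrate the midstate optimization
used by the 4-way scanhash functions: the constant first 64 bytes of the
80-byte header are absorbed once into a shared context, and each nonce batch
clones that context and hashes only the 16-byte tail. In outline:

// Midstate sketch: absorb the fixed 64-byte prefix once...
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );

// ...then per batch, clone it and finish only the nonce-bearing tail.
blake256r14_4way_context ctx;
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way_update( &ctx, vdata + (64<<2), 16 ); // skip 64 bytes x 4 lanes
blake256r14_4way_close( &ctx, vhash );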

View File

@@ -37,8 +37,6 @@
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY__ 1
//#ifdef __SSE4_2__
#ifdef __cplusplus
extern "C"{
#endif
@@ -51,46 +49,41 @@ extern "C"{
#define SPH_SIZE_blake512 512
// With SSE4.2 only Blake-256 4 way is available.
// With AVX2 Blake-256 8way & Blake-512 4 way are also available.
// Blake-256 4 way
//////////////////////////
//
// Blake-256 4 way SSE2
typedef struct {
unsigned char buf[64<<2];
uint32_t H[8<<2];
// __m128i buf[16] __attribute__ ((aligned (64)));
// __m128i H[8];
// __m128i S[4];
size_t ptr;
uint32_t T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context __attribute__ ((aligned (64)));
// Default 14 rounds
// Default, 14 rounds, blake, decred
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *ctx);
void blake256_4way_update(void *ctx, const void *data, size_t len);
#define blake256_4way blake256_4way_update
void blake256_4way_close(void *ctx, void *dst);
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way_update(void *cc, const void *data, size_t len);
#define blake256r14_4way blake256r14_4way_update
void blake256r14_4way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way_update(void *cc, const void *data, size_t len);
#define blake256r8_4way blake256r8_4way_update
void blake256r8_4way_close(void *cc, void *dst);
#ifdef __AVX2__
// Blake-256 8 way
//////////////////////////
//
// Blake-256 8 way AVX2
typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
@@ -104,7 +97,6 @@ typedef struct {
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
#define blake256_8way blake256_8way_update
void blake256_8way_close(void *cc, void *dst);
// 14 rounds, blake, decred
@@ -117,10 +109,9 @@ void blake256r14_8way_close(void *cc, void *dst);
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
#define blake256r8_8way blake256r8_8way_update
void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way
// Blake-512 4 way AVX2
typedef struct {
__m256i buf[16];
@@ -134,14 +125,15 @@ typedef blake_4way_big_context blake512_4way_context;
void blake512_4way_init( blake_4way_big_context *sc );
void blake512_4way_update( void *cc, const void *data, size_t len );
#define blake512_4way blake512_4way_update
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
//Blake-256 16 way
////////////////////////////
//
// Blake-256 16 way AVX512
typedef struct {
__m512i buf[16];
@@ -169,8 +161,9 @@ void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);
// Blake-512 8 way
////////////////////////////
//
//// Blake-512 8 way AVX512
typedef struct {
__m512i buf[16];
@@ -185,12 +178,10 @@ typedef blake_8way_big_context blake512_8way_context;
void blake512_8way_init( blake_8way_big_context *sc );
void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
#endif // AVX512
#endif // AVX2
#ifdef __cplusplus

View File

@@ -842,7 +842,8 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
}
static void
blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len )
blake32_4way( blake_4way_small_context *ctx, const void *data,
size_t len )
{
__m128i *buf = (__m128i*)ctx->buf;
size_t bptr = ctx->ptr<<2;
@@ -1237,7 +1238,7 @@ blake256_4way_init(void *ctx)
}
void
blake256_4way(void *ctx, const void *data, size_t len)
blake256_4way_update(void *ctx, const void *data, size_t len)
{
blake32_4way(ctx, data, len);
}

View File

@@ -39,7 +39,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
blake2b_8way_final( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
@@ -94,7 +94,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
blake2b_4way_final( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
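
The only functional change in these two hunks is < becoming <= in the lane
filter. A candidate hash whose high 32-bit word exactly equals the target's
high word can still be below the full 256-bit target, so the cheap pre-check
must pass it through to fulltest rather than reject it; the submission call
below is borrowed from the other hunks of this compare:

// Two-stage test: cheap 32-bit filter per lane, then the authoritative
// full-width comparison. With '<', a hash whose high word tied Htarg
// was wrongly discarded before fulltest could examine the low words.
if ( hash7[ lane<<1 ] <= Htarg )                  // fast per-lane filter
{
   extr_lane_8x64( lane_hash, hash, lane, 256 );  // de-interleave this lane
   if ( fulltest( lane_hash, ptarget ) )          // exact 256-bit check
      submit_solution( work, lane_hash, mythr );  // submission call assumed
}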

View File

@@ -43,17 +43,14 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
do {
be32enc(&endiandata[19], n);
//blake2b_hash_end(vhashcpu, endiandata);
blake2b_hash(vhashcpu, endiandata);
if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
work_set_target_ratio(work, vhashcpu);
*hashes_done = n - first_nonce + 1;
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
{
pdata[19] = n;
return 1;
}
n++;
submit_solution( work, vhashcpu, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;

View File

@@ -14,7 +14,6 @@
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#include "simd-utils.h"
@@ -132,6 +131,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
}
#endif
#endif // __SSE4_2__
#endif // __SSE2__
#endif

View File

@@ -56,7 +56,7 @@ int scanhash_blake2s( struct work *work,
do {
be32enc(&endiandata[19], n);
blake2s_hash( hash64, endiandata );
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;

View File

@@ -267,22 +267,22 @@ static const sph_u64 CB[16] = {
#define CBx_(n) CBx__(n)
#define CBx__(n) CB ## n
#define CB0 SPH_C64(0x243F6A8885A308D3)
#define CB1 SPH_C64(0x13198A2E03707344)
#define CB2 SPH_C64(0xA4093822299F31D0)
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
#define CB4 SPH_C64(0x452821E638D01377)
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
#define CB7 SPH_C64(0x3F84D5B5B5470917)
#define CB8 SPH_C64(0x9216D5D98979FB1B)
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
#define CBB SPH_C64(0xB8E1AFED6A267E96)
#define CBC SPH_C64(0xBA7C9045F12C7F99)
#define CBD SPH_C64(0x24A19947B3916CF7)
#define CBE SPH_C64(0x0801F2E2858EFC16)
#define CBF SPH_C64(0x636920D871574E69)
#define CB0 0x243F6A8885A308D3
#define CB1 0x13198A2E03707344
#define CB2 0xA4093822299F31D0
#define CB3 0x082EFA98EC4E6C89
#define CB4 0x452821E638D01377
#define CB5 0xBE5466CF34E90C6C
#define CB6 0xC0AC29B7C97C50DD
#define CB7 0x3F84D5B5B5470917
#define CB8 0x9216D5D98979FB1B
#define CB9 0xD1310BA698DFB5AC
#define CBA 0x2FFD72DBD01ADFB7
#define CBB 0xB8E1AFED6A267E96
#define CBC 0xBA7C9045F12C7F99
#define CBD 0x24A19947B3916CF7
#define CBE 0x0801F2E2858EFC16
#define CBF 0x636920D871574E69
#define READ_STATE64(state) do { \
H0 = (state)->H[0]; \
@@ -349,9 +349,9 @@ static const sph_u64 CB[16] = {
#define DECL_STATE64_8WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
__m512i S0, S1, S2, S3; \
sph_u64 T0, T1;
uint64_t T0, T1;
#define COMPRESS64_8WAY do \
#define COMPRESS64_8WAY( buf ) do \
{ \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -424,6 +424,84 @@ static const sph_u64 CB[16] = {
H7 = mm512_xor4( VF, V7, S3, H7 ); \
} while (0)
void blake512_8way_compress( blake_8way_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i shuf_bswap64;
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );
V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB4 ) );
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB5 ) );
VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB6 ) );
VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB7 ) );
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
ROUND_B_8WAY(6);
ROUND_B_8WAY(7);
ROUND_B_8WAY(8);
ROUND_B_8WAY(9);
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
}
void blake512_8way_init( blake_8way_big_context *sc )
{
__m512i zero = m512_zero;
@@ -455,39 +533,43 @@ blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
const int buf_size = 128; // sizeof/8
// 64, 80 bytes: 1st pass copy data. 2nd pass copy padding and compress.
// 128 bytes: 1st pass copy data, compress. 2nd pass copy padding, compress.
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE64(sc);
while ( len > 0 )
{
size_t clen;
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )
T1 = SPH_T64(T1 + 1);
COMPRESS64_8WAY;
ptr = 0;
}
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = T0 + 1024 ) < 1024 )
T1 = T1 + 1;
COMPRESS64_8WAY( buf );
ptr = 0;
}
}
WRITE_STATE64(sc);
sc->ptr = ptr;
}
}
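
The de-SPH'd counter update in this hunk, if ( ( T0 = T0 + 1024 ) < 1024 )
T1 = T1 + 1;, implements a 128-bit message bit counter in two 64-bit halves:
each 128-byte block adds 1024 bits to T0, and a sum that lands below 1024
means T0 wrapped past 2^64, so the carry goes into T1. A standalone
illustration:

// 128-bit counter in two uint64_t halves. After adding 1024, a result
// below 1024 means T0 wrapped around, so carry into the high half.
uint64_t T0 = 0, T1 = 0;
T0 += 1024;             // one 128-byte block = 1024 message bits
if ( T0 < 1024 )        // unsigned wraparound => carry
    T1 += 1;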
static void
blake64_8way_close( blake_8way_big_context *sc, void *dst )
@@ -495,26 +577,22 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
__m512i buf[16];
size_t ptr;
unsigned bit_len;
// uint64_t z, zz;
sph_u64 th, tl;
uint64_t th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
// z = 0x80 >> n;
// zz = ((ub & -z) | z) & 0xFF;
// buf[ptr>>3] = _mm512_set1_epi64( zz );
buf[ptr>>3] = m512_const1_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
{
@@ -535,8 +613,8 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
memset_zero_512( buf, 112>>3 );
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
buf[112>>3] = m512_const1_64( bswap_64( th ) );
@@ -547,6 +625,79 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
// init, update & close
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len )
{
// init
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->S, 0 ) = m512_zero;
casti_m512i( sc->S, 1 ) = m512_zero;
casti_m512i( sc->S, 2 ) = m512_zero;
casti_m512i( sc->S, 3 ) = m512_zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
// update
memcpy_512( sc->buf, (__m512i*)data, len>>3 );
sc->ptr = len;
if ( len == 128 )
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
sc->ptr = 0;
}
// close
size_t ptr64 = sc->ptr >> 3;
unsigned bit_len;
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m512_const1_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr64 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_const1_64( 0x0100000000000000ULL );
sc->buf[14] = m512_const1_64( bswap_64( th ) );
sc->buf[15] = m512_const1_64( bswap_64( tl ) );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
void
blake512_8way_update(void *cc, const void *data, size_t len)
{
@@ -555,12 +706,6 @@ blake512_8way_update(void *cc, const void *data, size_t len)
void
blake512_8way_close(void *cc, void *dst)
{
blake512_8way_addbits_and_close(cc, 0, 0, dst);
}
void
blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
blake64_8way_close(cc, dst);
}
@@ -596,7 +741,7 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
sph_u64 T0, T1;
uint64_t T0, T1;
#define COMPRESS64_4WAY do \
{ \
@@ -670,6 +815,81 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
} while (0)
void blake512_4way_compress( blake_4way_big_context *sc )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i shuf_bswap64;
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );
V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
m256_const1_64( CB4 ) );
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
m256_const1_64( CB5 ) );
VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
m256_const1_64( CB6 ) );
VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
m256_const1_64( CB7 ) );
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
ROUND_B_4WAY(6);
ROUND_B_4WAY(7);
ROUND_B_4WAY(8);
ROUND_B_4WAY(9);
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
}
void blake512_4way_init( blake_4way_big_context *sc )
{
__m256i zero = m256_zero;
@@ -681,10 +901,12 @@ void blake512_4way_init( blake_4way_big_context *sc )
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->S, 0 ) = zero;
casti_m256i( sc->S, 1 ) = zero;
casti_m256i( sc->S, 2 ) = zero;
casti_m256i( sc->S, 3 ) = zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
@@ -703,31 +925,31 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE64(sc);
while ( len > 0 )
{
size_t clen;
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if (ptr == buf_size )
{
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
T1 = SPH_T64(T1 + 1);
COMPRESS64_4WAY;
ptr = 0;
}
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == buf_size )
{
if ( (T0 = T0 + 1024 ) < 1024 )
T1 = SPH_T64(T1 + 1);
COMPRESS64_4WAY;
ptr = 0;
}
}
WRITE_STATE64(sc);
sc->ptr = ptr;
@@ -739,7 +961,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
__m256i buf[16];
size_t ptr;
unsigned bit_len;
sph_u64 th, tl;
uint64_t th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
@@ -748,13 +970,13 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
{
@@ -788,13 +1010,77 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
/*
void
blake512_4way_init(void *cc)
// init, update & close
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len )
{
blake64_4way_init(cc, IV512, salt_zero_big);
// init
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->S, 0 ) = m256_zero;
casti_m256i( sc->S, 1 ) = m256_zero;
casti_m256i( sc->S, 2 ) = m256_zero;
casti_m256i( sc->S, 3 ) = m256_zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
// update
memcpy_256( sc->buf, (__m256i*)data, len>>3 );
sc->ptr += len;
if ( len == 128 )
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
sc->ptr = 0;
}
// close
size_t ptr64 = sc->ptr >> 3;
unsigned bit_len;
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m256_const1_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( sc->ptr == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 1024 - bit_len;
memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m256_const1_64( 0x0100000000000000ULL );
sc->buf[14] = m256_const1_64( bswap_64( th ) );
sc->buf[15] = m256_const1_64( bswap_64( tl ) );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
*/
void
blake512_4way_update(void *cc, const void *data, size_t len)
@@ -806,17 +1092,8 @@ void
blake512_4way_close(void *cc, void *dst)
{
blake64_4way_close( cc, dst );
// blake512_4way_addbits_and_close(cc, dst);
}
/*
void
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
blake64_4way_close(cc, ub, n, dst, 8);
}
*/
#ifdef __cplusplus
}
#endif
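
The headline addition in this file is blake512_4way_full and
blake512_8way_full, which fold init, update, and close into a single call for
the fixed-length inputs mining actually uses (an 80-byte first pass, 64-byte
chained passes). A hedged usage sketch for four interleaved 80-byte headers:

// Usage sketch, assuming vdata holds four 80-byte block headers
// interleaved 4x64 as elsewhere in this compare.
blake512_4way_context ctx __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));  // four 512-bit digests

blake512_4way_full( &ctx, vhash, vdata, 80 );        // init + update + close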

View File

@@ -14,7 +14,7 @@ void blakecoin_4way_hash(void *state, const void *input)
blake256r8_4way_context ctx;
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
blake256r8_4way( &ctx, input + (64<<2), 16 );
blake256r8_4way_update( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
@@ -37,7 +37,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -71,7 +71,7 @@ void blakecoin_8way_hash( void *state, const void *input )
blake256r8_8way_context ctx;
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
blake256r8_8way( &ctx, input + (64<<3), 16 );
blake256r8_8way_update( &ctx, input + (64<<3), 16 );
blake256r8_8way_close( &ctx, vhash );
dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128,
@@ -95,7 +95,7 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256r8_8way_init( &blakecoin_8w_ctx );
blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );
blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,

View File

@@ -21,7 +21,7 @@ void decred_hash_4way( void *state, const void *input )
blake256_4way_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_update( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -46,7 +46,7 @@ int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {

View File

@@ -77,25 +77,15 @@ int scanhash_decred( struct work *work, uint32_t max_nonce,
be32enc(&endiandata[k], pdata[k]);
#endif
#ifdef DEBUG_ALGO
if (!thr_id) applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
#endif
do {
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
endiandata[DECRED_NONCE_INDEX] = n;
decred_hash(hash32, endiandata);
if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
work_set_target_ratio(work, hash32);
*hashes_done = n - first_nonce + 1;
#ifdef DEBUG_ALGO
applog(LOG_BLUE, "Nonce : %08x %08x", n, swab32(n));
applog_hash(ptarget);
applog_compare_hash(hash32, ptarget);
#endif
pdata[DECRED_NONCE_INDEX] = n;
return 1;
if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
{
pdata[DECRED_NONCE_INDEX] = n;
submit_solution( work, hash32, mythr );
}
n++;

View File

@@ -22,23 +22,23 @@ extern void pentablakehash_4way( void *output, const void *input )
blake512_4way_init( &ctx );
blake512_4way( &ctx, input, 80 );
blake512_4way_update( &ctx, input, 80 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
memcpy( output, hash0, 32 );
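
Pentablake is five chained blake-512 passes: one over the 80-byte input, then
four over the preceding 64-byte digest. The unrolled init/update/close
triples above could equivalently be written as a loop with the new one-shot
helper; a hedged sketch, not the shipped code:

// Equivalent shape of the five unrolled passes above.
blake512_4way_full( &ctx, vhash, input, 80 );    // pass 1: 80-byte header
for ( int i = 0; i < 4; i++ )
   blake512_4way_full( &ctx, vhash, vhash, 64 ); // passes 2-5: re-hash digest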

View File

@@ -1,476 +0,0 @@
/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
/*
* BLAKE implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include "../sph_blake.h"
#ifdef __cplusplus
extern "C"{
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
static const sph_u64 blkIV512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
#define Z00 0
#define Z01 1
#define Z02 2
#define Z03 3
#define Z04 4
#define Z05 5
#define Z06 6
#define Z07 7
#define Z08 8
#define Z09 9
#define Z0A A
#define Z0B B
#define Z0C C
#define Z0D D
#define Z0E E
#define Z0F F
#define Z10 E
#define Z11 A
#define Z12 4
#define Z13 8
#define Z14 9
#define Z15 F
#define Z16 D
#define Z17 6
#define Z18 1
#define Z19 C
#define Z1A 0
#define Z1B 2
#define Z1C B
#define Z1D 7
#define Z1E 5
#define Z1F 3
#define Z20 B
#define Z21 8
#define Z22 C
#define Z23 0
#define Z24 5
#define Z25 2
#define Z26 F
#define Z27 D
#define Z28 A
#define Z29 E
#define Z2A 3
#define Z2B 6
#define Z2C 7
#define Z2D 1
#define Z2E 9
#define Z2F 4
#define Z30 7
#define Z31 9
#define Z32 3
#define Z33 1
#define Z34 D
#define Z35 C
#define Z36 B
#define Z37 E
#define Z38 2
#define Z39 6
#define Z3A 5
#define Z3B A
#define Z3C 4
#define Z3D 0
#define Z3E F
#define Z3F 8
#define Z40 9
#define Z41 0
#define Z42 5
#define Z43 7
#define Z44 2
#define Z45 4
#define Z46 A
#define Z47 F
#define Z48 E
#define Z49 1
#define Z4A B
#define Z4B C
#define Z4C 6
#define Z4D 8
#define Z4E 3
#define Z4F D
#define Z50 2
#define Z51 C
#define Z52 6
#define Z53 A
#define Z54 0
#define Z55 B
#define Z56 8
#define Z57 3
#define Z58 4
#define Z59 D
#define Z5A 7
#define Z5B 5
#define Z5C F
#define Z5D E
#define Z5E 1
#define Z5F 9
#define Z60 C
#define Z61 5
#define Z62 1
#define Z63 F
#define Z64 E
#define Z65 D
#define Z66 4
#define Z67 A
#define Z68 0
#define Z69 7
#define Z6A 6
#define Z6B 3
#define Z6C 9
#define Z6D 2
#define Z6E 8
#define Z6F B
#define Z70 D
#define Z71 B
#define Z72 7
#define Z73 E
#define Z74 C
#define Z75 1
#define Z76 3
#define Z77 9
#define Z78 5
#define Z79 0
#define Z7A F
#define Z7B 4
#define Z7C 8
#define Z7D 6
#define Z7E 2
#define Z7F A
#define Z80 6
#define Z81 F
#define Z82 E
#define Z83 9
#define Z84 B
#define Z85 3
#define Z86 0
#define Z87 8
#define Z88 C
#define Z89 2
#define Z8A D
#define Z8B 7
#define Z8C 1
#define Z8D 4
#define Z8E A
#define Z8F 5
#define Z90 A
#define Z91 2
#define Z92 8
#define Z93 4
#define Z94 7
#define Z95 6
#define Z96 1
#define Z97 5
#define Z98 F
#define Z99 B
#define Z9A 9
#define Z9B E
#define Z9C 3
#define Z9D C
#define Z9E D
#define Z9F 0
#define Mx(r, i) Mx_(Z ## r ## i)
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
#define CSx(r, i) CSx_(Z ## r ## i)
#define CSx_(n) CSx__(n)
#define CSx__(n) CS ## n
#define CS0 SPH_C32(0x243F6A88)
#define CS1 SPH_C32(0x85A308D3)
#define CS2 SPH_C32(0x13198A2E)
#define CS3 SPH_C32(0x03707344)
#define CS4 SPH_C32(0xA4093822)
#define CS5 SPH_C32(0x299F31D0)
#define CS6 SPH_C32(0x082EFA98)
#define CS7 SPH_C32(0xEC4E6C89)
#define CS8 SPH_C32(0x452821E6)
#define CS9 SPH_C32(0x38D01377)
#define CSA SPH_C32(0xBE5466CF)
#define CSB SPH_C32(0x34E90C6C)
#define CSC SPH_C32(0xC0AC29B7)
#define CSD SPH_C32(0xC97C50DD)
#define CSE SPH_C32(0x3F84D5B5)
#define CSF SPH_C32(0xB5470917)
#define CBx(r, i) CBx_(Z ## r ## i)
#define CBx_(n) CBx__(n)
#define CBx__(n) CB ## n
#define CB0 SPH_C64(0x243F6A8885A308D3)
#define CB1 SPH_C64(0x13198A2E03707344)
#define CB2 SPH_C64(0xA4093822299F31D0)
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
#define CB4 SPH_C64(0x452821E638D01377)
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
#define CB7 SPH_C64(0x3F84D5B5B5470917)
#define CB8 SPH_C64(0x9216D5D98979FB1B)
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
#define CBB SPH_C64(0xB8E1AFED6A267E96)
#define CBC SPH_C64(0xBA7C9045F12C7F99)
#define CBD SPH_C64(0x24A19947B3916CF7)
#define CBE SPH_C64(0x0801F2E2858EFC16)
#define CBF SPH_C64(0x636920D871574E69)
#define GS(m0, m1, c0, c1, a, b, c, d) do { \
a = SPH_T32(a + b + (m0 ^ c1)); \
d = SPH_ROTR32(d ^ a, 16); \
c = SPH_T32(c + d); \
b = SPH_ROTR32(b ^ c, 12); \
a = SPH_T32(a + b + (m1 ^ c0)); \
d = SPH_ROTR32(d ^ a, 8); \
c = SPH_T32(c + d); \
b = SPH_ROTR32(b ^ c, 7); \
} while (0)
#define ROUND_S(r) do { \
GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
} while (0)
#define GB(m0, m1, c0, c1, a, b, c, d) do { \
a = SPH_T64(a + b + (m0 ^ c1)); \
d = SPH_ROTR64(d ^ a, 32); \
c = SPH_T64(c + d); \
b = SPH_ROTR64(b ^ c, 25); \
a = SPH_T64(a + b + (m1 ^ c0)); \
d = SPH_ROTR64(d ^ a, 16); \
c = SPH_T64(c + d); \
b = SPH_ROTR64(b ^ c, 11); \
} while (0)
#define ROUND_B(r) do { \
GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
#define COMPRESS64 do { \
int b=0; \
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = blkH0, \
V1 = blkH1, \
V2 = blkH2, \
V3 = blkH3, \
V4 = blkH4, \
V5 = blkH5, \
V6 = blkH6, \
V7 = blkH7; \
V8 = blkS0 ^ CB0, \
V9 = blkS1 ^ CB1, \
VA = blkS2 ^ CB2, \
VB = blkS3 ^ CB3, \
VC = hashctA ^ CB4, \
VD = hashctA ^ CB5, \
VE = hashctB ^ CB6, \
VF = hashctB ^ CB7; \
M0 = sph_dec64be_aligned(buf + 0), \
M1 = sph_dec64be_aligned(buf + 8), \
M2 = sph_dec64be_aligned(buf + 16), \
M3 = sph_dec64be_aligned(buf + 24), \
M4 = sph_dec64be_aligned(buf + 32), \
M5 = sph_dec64be_aligned(buf + 40), \
M6 = sph_dec64be_aligned(buf + 48), \
M7 = sph_dec64be_aligned(buf + 56), \
M8 = sph_dec64be_aligned(buf + 64), \
M9 = sph_dec64be_aligned(buf + 72), \
MA = sph_dec64be_aligned(buf + 80), \
MB = sph_dec64be_aligned(buf + 88), \
MC = sph_dec64be_aligned(buf + 96), \
MD = sph_dec64be_aligned(buf + 104), \
ME = sph_dec64be_aligned(buf + 112), \
MF = sph_dec64be_aligned(buf + 120); \
/* loop once and a half */ \
/* save some space */ \
for (;;) { \
ROUND_B(0); \
ROUND_B(1); \
ROUND_B(2); \
ROUND_B(3); \
ROUND_B(4); \
ROUND_B(5); \
if (b) break; \
b = 1; \
ROUND_B(6); \
ROUND_B(7); \
ROUND_B(8); \
ROUND_B(9); \
}; \
blkH0 ^= blkS0 ^ V0 ^ V8, \
blkH1 ^= blkS1 ^ V1 ^ V9, \
blkH2 ^= blkS2 ^ V2 ^ VA, \
blkH3 ^= blkS3 ^ V3 ^ VB, \
blkH4 ^= blkS0 ^ V4 ^ VC, \
blkH5 ^= blkS1 ^ V5 ^ VD, \
blkH6 ^= blkS2 ^ V6 ^ VE, \
blkH7 ^= blkS3 ^ V7 ^ VF; \
} while (0)
/*
*/
#define DECL_BLK \
sph_u64 blkH0; \
sph_u64 blkH1; \
sph_u64 blkH2; \
sph_u64 blkH3; \
sph_u64 blkH4; \
sph_u64 blkH5; \
sph_u64 blkH6; \
sph_u64 blkH7; \
sph_u64 blkS0; \
sph_u64 blkS1; \
sph_u64 blkS2; \
sph_u64 blkS3; \
/* load initial constants */
#define BLK_I \
do { \
blkH0 = SPH_C64(0x6A09E667F3BCC908); \
blkH1 = SPH_C64(0xBB67AE8584CAA73B); \
blkH2 = SPH_C64(0x3C6EF372FE94F82B); \
blkH3 = SPH_C64(0xA54FF53A5F1D36F1); \
blkH4 = SPH_C64(0x510E527FADE682D1); \
blkH5 = SPH_C64(0x9B05688C2B3E6C1F); \
blkH6 = SPH_C64(0x1F83D9ABFB41BD6B); \
blkH7 = SPH_C64(0x5BE0CD19137E2179); \
blkS0 = 0; \
blkS1 = 0; \
blkS2 = 0; \
blkS3 = 0; \
hashctB = SPH_T64(0- 1); \
} while (0)
/* copy in 80 for initial hash */
#define BLK_W \
do { \
memcpy(hashbuf, input, 80); \
hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 80*8; \
hashptr = 80; \
} while (0)
/* copy in 64 for looped hash */
#define BLK_U \
do { \
memcpy(hashbuf, hash , 64); \
hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 64*8; \
hashptr = 64; \
} while (0)
/* blake compress function */
/* hash = blake512(loaded) */
#define BLK_C \
do { \
\
union { \
unsigned char buf[128]; \
sph_u64 dummy; \
} u; \
size_t ptr; \
unsigned bit_len; \
\
ptr = hashptr; \
bit_len = ((unsigned)ptr << 3) + 0; \
u.buf[ptr] = ((0 & -(0x80)) | (0x80)) & 0xFF; \
memset(u.buf + ptr + 1, 0, 111 - ptr); \
u.buf[111] |= 1; \
sph_enc64be_aligned(u.buf + 112, 0); \
sph_enc64be_aligned(u.buf + 120, bit_len); \
do { \
const void *data = u.buf + ptr; \
unsigned char *buf; \
buf = hashbuf; \
size_t clen; \
clen = (sizeof(char)*128) - hashptr; \
memcpy(buf + hashptr, data, clen); \
hashctA = SPH_T64(hashctA + 1024); \
hashctB = SPH_T64(hashctB + 1); \
COMPRESS64; \
} while (0); \
/* end blake64(sc, u.buf + ptr, 128 - ptr); */ \
sph_enc64be((unsigned char*)(hash) + (0 << 3), blkH0), \
sph_enc64be((unsigned char*)(hash) + (1 << 3), blkH1); \
sph_enc64be((unsigned char*)(hash) + (2 << 3), blkH2), \
sph_enc64be((unsigned char*)(hash) + (3 << 3), blkH3); \
sph_enc64be((unsigned char*)(hash) + (4 << 3), blkH4), \
sph_enc64be((unsigned char*)(hash) + (5 << 3), blkH5); \
sph_enc64be((unsigned char*)(hash) + (6 << 3), blkH6), \
sph_enc64be((unsigned char*)(hash) + (7 << 3), blkH7); \
} while (0)
#ifdef __cplusplus
}
#endif

View File

@@ -1,2 +0,0 @@
#define CRYPTO_BYTES 64

View File

@@ -1,2 +0,0 @@
amd64
x86

View File

@@ -1,8 +0,0 @@
#ifndef __BLAKE512_CONFIG_H__
#define __BLAKE512_CONFIG_H__
#define AVOID_BRANCHING 1
//#define HAVE_XOP 1
#endif

View File

@@ -1,287 +0,0 @@
#include "hash.h"
/*
#ifndef NOT_SUPERCOP
#include "crypto_hash.h"
#include "crypto_uint64.h"
#include "crypto_uint32.h"
#include "crypto_uint8.h"
typedef crypto_uint64 u64;
typedef crypto_uint32 u32;
typedef crypto_uint8 u8;
#else
typedef unsigned long long u64;
typedef unsigned int u32;
typedef unsigned char u8;
#endif
*/
#define U8TO32(p) \
(((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
((u32)((p)[2]) << 8) | ((u32)((p)[3]) ))
#define U8TO64(p) \
(((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))
#define U32TO8(p, v) \
(p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
(p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) );
#define U64TO8(p, v) \
U32TO8((p), (u32)((v) >> 32)); \
U32TO8((p) + 4, (u32)((v) ));
/*
typedef struct
{
__m128i h[4];
u64 s[4], t[2];
u32 buflen, nullt;
u8 buf[128];
} state __attribute__ ((aligned (64)));
*/
static const u8 padding[129] =
{
0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
static inline int blake512_compress( hashState_blake * state, const u8 * datablock )
{
__m128i row1l,row1h;
__m128i row2l,row2h;
__m128i row3l,row3h;
__m128i row4l,row4h;
const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9);
const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
__m128i m0, m1, m2, m3, m4, m5, m6, m7;
__m128i t0, t1, t2, t3, t4, t5, t6, t7;
__m128i b0, b1, b2, b3;
m0 = _mm_loadu_si128((__m128i*)(datablock + 0));
m1 = _mm_loadu_si128((__m128i*)(datablock + 16));
m2 = _mm_loadu_si128((__m128i*)(datablock + 32));
m3 = _mm_loadu_si128((__m128i*)(datablock + 48));
m4 = _mm_loadu_si128((__m128i*)(datablock + 64));
m5 = _mm_loadu_si128((__m128i*)(datablock + 80));
m6 = _mm_loadu_si128((__m128i*)(datablock + 96));
m7 = _mm_loadu_si128((__m128i*)(datablock + 112));
m0 = BSWAP64(m0);
m1 = BSWAP64(m1);
m2 = BSWAP64(m2);
m3 = BSWAP64(m3);
m4 = BSWAP64(m4);
m5 = BSWAP64(m5);
m6 = BSWAP64(m6);
m7 = BSWAP64(m7);
row1l = state->h[0];
row1h = state->h[1];
row2l = state->h[2];
row2h = state->h[3];
row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL);
row3h = _mm_set_epi64x(0x082EFA98EC4E6C89ULL, 0xA4093822299F31D0ULL);
row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL);
row4h = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xC0AC29B7C97C50DDULL);
#ifdef AVOID_BRANCHING
do
{
const __m128i mask = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_set1_epi32(state->nullt));
const __m128i xor1 = _mm_and_si128(_mm_set1_epi64x(state->t[0]), mask);
const __m128i xor2 = _mm_and_si128(_mm_set1_epi64x(state->t[1]), mask);
row4l = _mm_xor_si128(row4l, xor1);
row4h = _mm_xor_si128(row4h, xor2);
} while(0);
#else
if(!state->nullt)
{
row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0]));
row4h = _mm_xor_si128(row4h, _mm_set1_epi64x(state->t[1]));
}
#endif
ROUND( 0);
ROUND( 1);
ROUND( 2);
ROUND( 3);
ROUND( 4);
ROUND( 5);
ROUND( 6);
ROUND( 7);
ROUND( 8);
ROUND( 9);
ROUND(10);
ROUND(11);
ROUND(12);
ROUND(13);
ROUND(14);
ROUND(15);
row1l = _mm_xor_si128(row3l,row1l);
row1h = _mm_xor_si128(row3h,row1h);
state->h[0] = _mm_xor_si128(row1l, state->h[0]);
state->h[1] = _mm_xor_si128(row1h, state->h[1]);
row2l = _mm_xor_si128(row4l,row2l);
row2h = _mm_xor_si128(row4h,row2h);
state->h[2] = _mm_xor_si128(row2l, state->h[2]);
state->h[3] = _mm_xor_si128(row2h, state->h[3]);
return 0;
}
static inline void blake512_init( hashState_blake * S, u64 databitlen )
{
memset(S, 0, sizeof(hashState_blake));
S->h[0] = _mm_set_epi64x(0xBB67AE8584CAA73BULL, 0x6A09E667F3BCC908ULL);
S->h[1] = _mm_set_epi64x(0xA54FF53A5F1D36F1ULL, 0x3C6EF372FE94F82BULL);
S->h[2] = _mm_set_epi64x(0x9B05688C2B3E6C1FULL, 0x510E527FADE682D1ULL);
S->h[3] = _mm_set_epi64x(0x5BE0CD19137E2179ULL, 0x1F83D9ABFB41BD6BULL);
S->buflen = databitlen;
}
static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen )
{
int left = (S->buflen >> 3);
int fill = 128 - left;
if( left && ( ((datalen >> 3) & 0x7F) >= fill ) ) {
memcpy( (void *) (S->buf + left), (void *) data, fill );
S->t[0] += 1024;
blake512_compress( S, S->buf );
data += fill;
datalen -= (fill << 3);
left = 0;
}
while( datalen >= 1024 ) {
S->t[0] += 1024;
blake512_compress( S, data );
data += 128;
datalen -= 1024;
}
if( datalen > 0 ) {
memcpy( (void *) (S->buf + left), (void *) data, ( datalen>>3 ) & 0x7F );
S->buflen = (left<<3) + datalen;
}
else S->buflen=0;
}
static inline void blake512_final( hashState_blake * S, u8 * digest )
{
u8 msglen[16], zo=0x01,oo=0x81;
u64 lo=S->t[0] + S->buflen, hi = S->t[1];
if ( lo < S->buflen ) hi++;
U64TO8( msglen + 0, hi );
U64TO8( msglen + 8, lo );
if ( S->buflen == 888 ) /* one padding byte */
{
S->t[0] -= 8;
blake512_update( S, &oo, 8 );
}
else
{
if ( S->buflen < 888 ) /* enough space to fill the block */
{
if ( S->buflen == 0 ) S->nullt=1;
S->t[0] -= 888 - S->buflen;
blake512_update( S, padding, 888 - S->buflen );
}
else /* NOT enough space, need 2 compressions */
{
S->t[0] -= 1024 - S->buflen;
blake512_update( S, padding, 1024 - S->buflen );
S->t[0] -= 888;
blake512_update( S, padding+1, 888 );
S->nullt = 1;
}
blake512_update( S, &zo, 8 );
S->t[0] -= 8;
}
S->t[0] -= 128;
blake512_update( S, msglen, 128 );
do
{
const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
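/* u8to64 is the byte-swap shuffle mask consumed by the BSWAP64 macro;
   the digest is emitted in big-endian order. */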
_mm_storeu_si128((__m128i*)(digest + 0), BSWAP64(S->h[0]));
_mm_storeu_si128((__m128i*)(digest + 16), BSWAP64(S->h[1]));
_mm_storeu_si128((__m128i*)(digest + 32), BSWAP64(S->h[2]));
_mm_storeu_si128((__m128i*)(digest + 48), BSWAP64(S->h[3]));
} while(0);
}
/*
int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen )
{
hashState_blake S;
blake512_init( &S );
blake512_update( &S, in, inlen*8 );
blake512_final( &S, out );
return 0;
}
*/
/*
#ifdef NOT_SUPERCOP
int main()
{
int i, v;
u8 data[144], digest[64];
u8 test1[]= {0x97, 0x96, 0x15, 0x87, 0xF6, 0xD9, 0x70, 0xFA, 0xBA, 0x6D, 0x24, 0x78, 0x04, 0x5D, 0xE6, 0xD1,
0xFA, 0xBD, 0x09, 0xB6, 0x1A, 0xE5, 0x09, 0x32, 0x05, 0x4D, 0x52, 0xBC, 0x29, 0xD3, 0x1B, 0xE4,
0xFF, 0x91, 0x02, 0xB9, 0xF6, 0x9E, 0x2B, 0xBD, 0xB8, 0x3B, 0xE1, 0x3D, 0x4B, 0x9C, 0x06, 0x09,
0x1E, 0x5F, 0xA0, 0xB4, 0x8B, 0xD0, 0x81, 0xB6, 0x34, 0x05, 0x8B, 0xE0, 0xEC, 0x49, 0xBE, 0xB3};
u8 test2[]= {0x31, 0x37, 0x17, 0xD6, 0x08, 0xE9, 0xCF, 0x75, 0x8D, 0xCB, 0x1E, 0xB0, 0xF0, 0xC3, 0xCF, 0x9F,
0xC1, 0x50, 0xB2, 0xD5, 0x00, 0xFB, 0x33, 0xF5, 0x1C, 0x52, 0xAF, 0xC9, 0x9D, 0x35, 0x8A, 0x2F,
0x13, 0x74, 0xB8, 0xA3, 0x8B, 0xBA, 0x79, 0x74, 0xE7, 0xF6, 0xEF, 0x79, 0xCA, 0xB1, 0x6F, 0x22,
0xCE, 0x1E, 0x64, 0x9D, 0x6E, 0x01, 0xAD, 0x95, 0x89, 0xC2, 0x13, 0x04, 0x5D, 0x54, 0x5D, 0xDE};
for(i=0; i<144; ++i) data[i]=0;
crypto_hash( digest, data, 1 );
v=0;
for(i=0; i<64; ++i) {
printf("%02X", digest[i]);
if ( digest[i] != test1[i]) v=1;
}
if (v) printf("\nerror\n");
else printf("\nok\n");
for(i=0; i<144; ++i) data[i]=0;
crypto_hash( digest, data, 144 );
v=0;
for(i=0; i<64; ++i) {
printf("%02X", digest[i]);
if ( digest[i] != test2[i]) v=1;
}
if (v) printf("\nerror\n");
else printf("\nok\n");
return 0;
}
#endif
*/


@@ -1,74 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
#include "config.h"
#include "rounds.h"
/*
#ifndef NOT_SUPERCOP
#include "crypto_hash.h"
#include "crypto_uint64.h"
#include "crypto_uint32.h"
#include "crypto_uint8.h"
typedef crypto_uint64 u64;
typedef crypto_uint32 u32;
typedef crypto_uint8 u8;
#else
*/
typedef unsigned long long u64;
typedef unsigned int u32;
typedef unsigned char u8;
typedef struct
{
__m128i h[4];
u64 s[4], t[2];
u32 buflen, nullt;
u8 buf[128];
} hashState_blake __attribute__ ((aligned (64)));
/*
#endif
#define U8TO32(p) \
(((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
((u32)((p)[2]) << 8) | ((u32)((p)[3]) ))
#define U8TO64(p) \
(((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))
#define U32TO8(p, v) \
(p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
(p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) );
#define U64TO8(p, v) \
U32TO8((p), (u32)((v) >> 32)); \
U32TO8((p) + 4, (u32)((v) ));
*/
/*
static const u8 padding[129] =
{
0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
*/
static inline void blake512_init( hashState_blake * S, u64 datalen );
static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen ) ;
static inline void blake512_final( hashState_blake * S, u8 * digest ) ;
int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) ;


@@ -1,2 +0,0 @@
Jean-Philippe Aumasson
Samuel Neves


@@ -1,871 +0,0 @@
#ifndef __BLAKE512_ROUNDS_H__
#define __BLAKE512_ROUNDS_H__
#ifndef HAVE_XOP
#define BSWAP64(x) _mm_shuffle_epi8((x), u8to64)
#define _mm_roti_epi64(x, c) \
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-c)))
#else
#define BSWAP64(x) _mm_perm_epi8((x),(x),u8to64)
#endif
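/* Without XOP, 64-bit rotates are emulated: rotations by 32 and 16 become
   cheap shuffles (r16 is a pshufb mask expected to be defined by the code
   using these macros), anything else two shifts and an XOR; with XOP the
   native _mm_roti_epi64 is available. */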
#define LOAD_MSG_0_1(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m0, m1); \
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m2, m3); \
t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_0_2(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m0, m1); \
t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m2, m3); \
t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_0_3(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m4, m5); \
t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m6, m7); \
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_0_4(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m4, m5); \
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m6, m7); \
t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_1_1(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m7, m2); \
t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m4, m6); \
t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_1_2(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m5, m4); \
t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_alignr_epi8(m3, m7, 8); \
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_1_3(b0, b1) \
do \
{ \
t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m5, m2); \
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_1_4(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m6, m1); \
t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m3, m1); \
t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_2_1(b0, b1) \
do \
{ \
t0 = _mm_alignr_epi8(m6, m5, 8); \
t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m2, m7); \
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_2_2(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m4, m0); \
t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m1, m6, 0xF0); \
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_2_3(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m5, m1, 0xF0); \
t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m3, m4); \
t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_2_4(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m7, m3); \
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_alignr_epi8(m2, m0, 8); \
t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_3_1(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m3, m1); \
t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m6, m5); \
t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_3_2(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m4, m0); \
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m6, m7); \
t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_3_3(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m1, m2, 0xF0); \
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m2, m7, 0xF0); \
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_3_4(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m3, m5); \
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m0, m4); \
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_4_1(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m4, m2); \
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m1, m5); \
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_4_2(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m0, m3, 0xF0); \
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m2, m7, 0xF0); \
t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_4_3(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m7, m5, 0xF0); \
t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m3, m1, 0xF0); \
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_4_4(b0, b1) \
do \
{ \
t0 = _mm_alignr_epi8(m6, m0, 8); \
t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m4, m6, 0xF0); \
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_5_1(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m1, m3); \
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m0, m4); \
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_5_2(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m6, m5); \
t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m5, m1); \
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_5_3(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m2, m3, 0xF0); \
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m7, m0); \
t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_5_4(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m6, m2); \
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m7, m4, 0xF0); \
t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_6_1(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m6, m0, 0xF0); \
t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m7, m2); \
t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x24A19947B3916CF7ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_6_2(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m2, m7); \
t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xBA7C9045F12C7F99ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_alignr_epi8(m5, m6, 8); \
t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_6_3(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m0, m3); \
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xA4093822299F31D0ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_6_4(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m3, m1); \
t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x243F6A8885A308D3ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m1, m5, 0xF0); \
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0xD1310BA698DFB5ACULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_7_1(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m6, m3); \
t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m6, m1, 0xF0); \
t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x13198A2E03707344ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_7_2(b0, b1) \
do \
{ \
t0 = _mm_alignr_epi8(m7, m5, 8); \
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x24A19947B3916CF7ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m0, m4); \
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xBA7C9045F12C7F99ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_7_3(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m2, m7); \
t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x243F6A8885A308D3ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m4, m1); \
t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_7_4(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m0, m2); \
t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m3, m5); \
t3 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x9216D5D98979FB1BULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_8_1(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m3, m7); \
t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_alignr_epi8(m0, m5, 8); \
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x82EFA98EC4E6C89ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_8_2(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m7, m4); \
t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xC0AC29B7C97C50DDULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_alignr_epi8(m4, m1, 8); \
t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xB8E1AFED6A267E96ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_8_3(b0, b1) \
do \
{ \
t0 = m6; \
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xA4093822299F31D0ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_alignr_epi8(m5, m0, 8); \
t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_8_4(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m1, m3, 0xF0); \
t1 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xBA7C9045F12C7F99ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = m2; \
t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x13198A2E03707344ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_9_1(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m5, m4); \
t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0xA4093822299F31D0ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m3, m0); \
t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xC0AC29B7C97C50DDULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_9_2(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m1, m2); \
t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m3, m2, 0xF0); \
t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x3F84D5B5B5470917ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_9_3(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m7, m4); \
t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m1, m6); \
t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xBA7C9045F12C7F99ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_9_4(b0, b1) \
do \
{ \
t0 = _mm_alignr_epi8(m7, m5, 8); \
t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m6, m0); \
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x82EFA98EC4E6C89ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_10_1(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m0, m1); \
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m2, m3); \
t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_10_2(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m0, m1); \
t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m2, m3); \
t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_10_3(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m4, m5); \
t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m6, m7); \
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_10_4(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m4, m5); \
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m6, m7); \
t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_11_1(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m7, m2); \
t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m4, m6); \
t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_11_2(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m5, m4); \
t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_alignr_epi8(m3, m7, 8); \
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_11_3(b0, b1) \
do \
{ \
t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m5, m2); \
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_11_4(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m6, m1); \
t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m3, m1); \
t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_12_1(b0, b1) \
do \
{ \
t0 = _mm_alignr_epi8(m6, m5, 8); \
t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m2, m7); \
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_12_2(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m4, m0); \
t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m1, m6, 0xF0); \
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_12_3(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m5, m1, 0xF0); \
t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m3, m4); \
t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_12_4(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m7, m3); \
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_alignr_epi8(m2, m0, 8); \
t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_13_1(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m3, m1); \
t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m6, m5); \
t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_13_2(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m4, m0); \
t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m6, m7); \
t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_13_3(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m1, m2, 0xF0); \
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m2, m7, 0xF0); \
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_13_4(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m3, m5); \
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m0, m4); \
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_14_1(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m4, m2); \
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m1, m5); \
t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_14_2(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m0, m3, 0xF0); \
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m2, m7, 0xF0); \
t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_14_3(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m7, m5, 0xF0); \
t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m3, m1, 0xF0); \
t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_14_4(b0, b1) \
do \
{ \
t0 = _mm_alignr_epi8(m6, m0, 8); \
t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m4, m6, 0xF0); \
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_15_1(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m1, m3); \
t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpacklo_epi64(m0, m4); \
t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_15_2(b0, b1) \
do \
{ \
t0 = _mm_unpacklo_epi64(m6, m5); \
t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m5, m1); \
t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_15_3(b0, b1) \
do \
{ \
t0 = _mm_blend_epi16(m2, m3, 0xF0); \
t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_unpackhi_epi64(m7, m0); \
t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
#define LOAD_MSG_15_4(b0, b1) \
do \
{ \
t0 = _mm_unpackhi_epi64(m6, m2); \
t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \
b0 = _mm_xor_si128(t0, t1); \
t2 = _mm_blend_epi16(m7, m4, 0xF0); \
t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \
b1 = _mm_xor_si128(t2, t3); \
} while(0)
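/* Each LOAD_MSG_r_i builds the b0/b1 inputs for one half-G step of round r:
   the message words selected by the BLAKE permutation sigma_r, already
   XORed with their paired round constants, assembled with unpack/align/blend
   shuffles so the schedule costs no extra loads. */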
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -32); \
row4h = _mm_roti_epi64(row4h, -32); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -25); \
row2h = _mm_roti_epi64(row2h, -25);
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
\
row4l = _mm_xor_si128(row4l, row1l); \
row4h = _mm_xor_si128(row4h, row1h); \
\
row4l = _mm_roti_epi64(row4l, -16); \
row4h = _mm_roti_epi64(row4h, -16); \
\
row3l = _mm_add_epi64(row3l, row4l); \
row3h = _mm_add_epi64(row3h, row4h); \
\
row2l = _mm_xor_si128(row2l, row3l); \
row2h = _mm_xor_si128(row2h, row3h); \
\
row2l = _mm_roti_epi64(row2l, -11); \
row2h = _mm_roti_epi64(row2h, -11);
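/* G1 and G2 are the two halves of the BLAKE-512 G function (rotations
   32,25 and 16,11), applied to all four columns or diagonals at once
   across the l/h register pairs. */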
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2h, row2l, 8); \
t1 = _mm_alignr_epi8(row2l, row2h, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4h, row4l, 8); \
t1 = _mm_alignr_epi8(row4l, row4h, 8); \
row4l = t1; \
row4h = t0;
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
t0 = _mm_alignr_epi8(row2l, row2h, 8); \
t1 = _mm_alignr_epi8(row2h, row2l, 8); \
row2l = t0; \
row2h = t1; \
\
t0 = row3l; \
row3l = row3h; \
row3h = t0; \
\
t0 = _mm_alignr_epi8(row4l, row4h, 8); \
t1 = _mm_alignr_epi8(row4h, row4l, 8); \
row4l = t1; \
row4h = t0;
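/* DIAGONALIZE rotates rows 2-4 so the same column-wise G code acts on the
   diagonals of the 4x4 state; UNDIAGONALIZE restores the layout. */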
#define ROUND(r) \
LOAD_MSG_ ##r ##_1(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_2(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
LOAD_MSG_ ##r ##_3(b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_4(b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
#endif
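For orientation, here is a scalar sketch of the G function that the macros above vectorize across four columns or diagonals. It is illustrative only: blake512_g and rotr64 are hypothetical names, not part of this source tree, and the sigma/constant premixing is assumed to be done by the caller, as LOAD_MSG does.
#include <stdint.h> /* sketch only */
static inline uint64_t rotr64( uint64_t x, int n )
{ return ( x >> n ) | ( x << ( 64 - n ) ); }
/* One scalar BLAKE-512 G on (a,b,c,d). m0c1/m1c0 are the premixed words
   m[sigma(2i)]^c[sigma(2i+1)] and m[sigma(2i+1)]^c[sigma(2i)] that the
   LOAD_MSG_r_i macros deliver four at a time. */
static inline void blake512_g( uint64_t v[16], int a, int b, int c, int d,
                               uint64_t m0c1, uint64_t m1c0 )
{
   v[a] += v[b] + m0c1; v[d] = rotr64( v[d] ^ v[a], 32 );
   v[c] += v[d];        v[b] = rotr64( v[b] ^ v[c], 25 );
   v[a] += v[b] + m1c0; v[d] = rotr64( v[d] ^ v[a], 16 );
   v[c] += v[d];        v[b] = rotr64( v[b] ^ v[c], 11 );
}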


@@ -40,8 +40,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
bmw512hash_8way( hash, vdata );
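// Quick reject: compare only the high 32 bits of each lane's hash to the
// target; fulltest below performs the complete 256-bit comparison.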
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
@@ -66,7 +65,7 @@ void bmw512hash_4way(void *state, const void *input)
{
bmw512_4way_context ctx;
bmw512_4way_init( &ctx );
bmw512_4way_update( &ctx, input, 80 );
bmw512_4way_close( &ctx, state );
}
@@ -94,8 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
bmw512hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )


@@ -1,519 +0,0 @@
/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
/*
* BMW implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include <limits.h>
#ifdef __cplusplus
extern "C"{
#endif
#include "../sph_bmw.h"
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
static const sph_u64 bmwIV512[] = {
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
};
#define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y
#define LPAR (
#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
#define M16_16 0, 1, 3, 4, 7, 10, 11
#define M16_17 1, 2, 4, 5, 8, 11, 12
#define M16_18 2, 3, 5, 6, 9, 12, 13
#define M16_19 3, 4, 6, 7, 10, 13, 14
#define M16_20 4, 5, 7, 8, 11, 14, 15
#define M16_21 5, 6, 8, 9, 12, 15, 16
#define M16_22 6, 7, 9, 10, 13, 0, 1
#define M16_23 7, 8, 10, 11, 14, 1, 2
#define M16_24 8, 9, 11, 12, 15, 2, 3
#define M16_25 9, 10, 12, 13, 0, 3, 4
#define M16_26 10, 11, 13, 14, 1, 4, 5
#define M16_27 11, 12, 14, 15, 2, 5, 6
#define M16_28 12, 13, 15, 16, 3, 6, 7
#define M16_29 13, 14, 0, 1, 4, 7, 8
#define M16_30 14, 15, 1, 2, 5, 8, 9
#define M16_31 15, 16, 2, 3, 6, 9, 10
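/* I16_xx unrolls the sixteen q[] operands of each expansion step, while
   M16_xx packs the message indices j, j+3, j+10 (mod 16), their rotation
   counts (index+1) and the chaining-word index j+7 used by add_elt, so no
   modular arithmetic is left for run time. */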
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
#define ss4(x) (((x) >> 1) ^ (x))
#define ss5(x) (((x) >> 2) ^ (x))
#define rs1(x) SPH_ROTL32(x, 3)
#define rs2(x) SPH_ROTL32(x, 7)
#define rs3(x) SPH_ROTL32(x, 13)
#define rs4(x) SPH_ROTL32(x, 16)
#define rs5(x) SPH_ROTL32(x, 19)
#define rs6(x) SPH_ROTL32(x, 23)
#define rs7(x) SPH_ROTL32(x, 27)
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
#define expand1s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand1s(qf, mf, hf, i16) \
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand1s_(qf, mf, hf, i16, ix, iy) \
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
#define expand2s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand2s(qf, mf, hf, i16) \
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand2s_(qf, mf, hf, i16, ix, iy) \
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
#if SPH_64
#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \
^ SPH_ROTL64(x, 4) ^ SPH_ROTL64(x, 37))
#define sb1(x) (((x) >> 1) ^ SPH_T64((x) << 2) \
^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43))
#define sb2(x) (((x) >> 2) ^ SPH_T64((x) << 1) \
^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53))
#define sb3(x) (((x) >> 2) ^ SPH_T64((x) << 2) \
^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59))
#define sb4(x) (((x) >> 1) ^ (x))
#define sb5(x) (((x) >> 2) ^ (x))
#define rb1(x) SPH_ROTL64(x, 5)
#define rb2(x) SPH_ROTL64(x, 11)
#define rb3(x) SPH_ROTL64(x, 27)
#define rb4(x) SPH_ROTL64(x, 32)
#define rb5(x) SPH_ROTL64(x, 37)
#define rb6(x) SPH_ROTL64(x, 43)
#define rb7(x) SPH_ROTL64(x, 53)
#define Kb(j) SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555))
#if 0
static const sph_u64 Kb_tab[] = {
Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23),
Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31)
};
#define rol_off(mf, j, off) \
SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1)
#define add_elt_b(mf, hf, j) \
(SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \
- rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15))
#define expand1b(qf, mf, hf, i) \
SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \
+ sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \
+ sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \
+ sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \
+ sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \
+ sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \
+ sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \
+ sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \
+ add_elt_b(mf, hf, (i) - 16))
#define expand2b(qf, mf, hf, i) \
SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \
+ qf((i) - 14) + rb2(qf((i) - 13)) \
+ qf((i) - 12) + rb3(qf((i) - 11)) \
+ qf((i) - 10) + rb4(qf((i) - 9)) \
+ qf((i) - 8) + rb5(qf((i) - 7)) \
+ qf((i) - 6) + rb6(qf((i) - 5)) \
+ qf((i) - 4) + rb7(qf((i) - 3)) \
+ sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \
+ add_elt_b(mf, hf, (i) - 16))
#else
#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
(SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \
- SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m))
#define expand1b_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \
+ sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \
+ sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \
+ sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \
+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand1b(qf, mf, hf, i16) \
expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand1b_(qf, mf, hf, i16, ix, iy) \
expand1b_inner LPAR qf, mf, hf, i16, ix, iy)
#define expand2b_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \
+ qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \
+ qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \
+ qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \
+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define expand2b(qf, mf, hf, i16) \
expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand2b_(qf, mf, hf, i16, ix, iy) \
expand2b_inner LPAR qf, mf, hf, i16, ix, iy)
#endif
#endif
#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \
tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
#define MAKE_Qas do { \
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
} while (0)
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#define MAKE_Qs do { \
MAKE_Qas; \
MAKE_Qbs; \
} while (0)
#define Qs(j) (qt[j])
#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14)
#define Wb1 MAKE_W(SPH_T64, 6, -, 8, +, 11, +, 14, -, 15)
#define Wb2 MAKE_W(SPH_T64, 0, +, 7, +, 9, -, 12, +, 15)
#define Wb3 MAKE_W(SPH_T64, 0, -, 1, +, 8, -, 10, +, 13)
#define Wb4 MAKE_W(SPH_T64, 1, +, 2, +, 9, -, 11, -, 14)
#define Wb5 MAKE_W(SPH_T64, 3, -, 2, +, 10, -, 12, +, 15)
#define Wb6 MAKE_W(SPH_T64, 4, -, 0, -, 3, -, 11, +, 13)
#define Wb7 MAKE_W(SPH_T64, 1, -, 4, -, 5, -, 12, -, 14)
#define Wb8 MAKE_W(SPH_T64, 2, -, 5, -, 6, +, 13, -, 15)
#define Wb9 MAKE_W(SPH_T64, 0, -, 3, +, 6, -, 7, +, 14)
#define Wb10 MAKE_W(SPH_T64, 8, -, 1, -, 4, -, 7, +, 15)
#define Wb11 MAKE_W(SPH_T64, 8, -, 0, -, 2, -, 5, +, 9)
#define Wb12 MAKE_W(SPH_T64, 1, +, 3, -, 6, -, 9, +, 10)
#define Wb13 MAKE_W(SPH_T64, 2, +, 4, +, 7, +, 10, +, 11)
#define Wb14 MAKE_W(SPH_T64, 3, -, 5, +, 8, -, 11, -, 12)
#define Wb15 MAKE_W(SPH_T64, 12, -, 4, -, 6, -, 9, +, 13)
#define MAKE_Qab do { \
qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \
qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \
qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \
qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \
qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \
qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \
qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \
qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \
qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \
qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \
qt[10] = SPH_T64(sb0(Wb10) + H(11)); \
qt[11] = SPH_T64(sb1(Wb11) + H(12)); \
qt[12] = SPH_T64(sb2(Wb12) + H(13)); \
qt[13] = SPH_T64(sb3(Wb13) + H(14)); \
qt[14] = SPH_T64(sb4(Wb14) + H(15)); \
qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \
} while (0)
#define MAKE_Qbb do { \
qt[16] = expand1b(Qb, M, H, 16); \
qt[17] = expand1b(Qb, M, H, 17); \
qt[18] = expand2b(Qb, M, H, 18); \
qt[19] = expand2b(Qb, M, H, 19); \
qt[20] = expand2b(Qb, M, H, 20); \
qt[21] = expand2b(Qb, M, H, 21); \
qt[22] = expand2b(Qb, M, H, 22); \
qt[23] = expand2b(Qb, M, H, 23); \
qt[24] = expand2b(Qb, M, H, 24); \
qt[25] = expand2b(Qb, M, H, 25); \
qt[26] = expand2b(Qb, M, H, 26); \
qt[27] = expand2b(Qb, M, H, 27); \
qt[28] = expand2b(Qb, M, H, 28); \
qt[29] = expand2b(Qb, M, H, 29); \
qt[30] = expand2b(Qb, M, H, 30); \
qt[31] = expand2b(Qb, M, H, 31); \
} while (0)
#define MAKE_Qb do { \
MAKE_Qab; \
MAKE_Qbb; \
} while (0)
#define Qb(j) (qt[j])
#define FOLD(type, mkQ, tt, rol, mf, qf, dhf) do { \
type qt[32], xl, xh; \
mkQ; \
xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \
^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \
xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \
^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \
dhf( 0) = tt(((xh << 5) ^ (qf(16) >> 5) ^ mf( 0)) \
+ (xl ^ qf(24) ^ qf( 0))); \
dhf( 1) = tt(((xh >> 7) ^ (qf(17) << 8) ^ mf( 1)) \
+ (xl ^ qf(25) ^ qf( 1))); \
dhf( 2) = tt(((xh >> 5) ^ (qf(18) << 5) ^ mf( 2)) \
+ (xl ^ qf(26) ^ qf( 2))); \
dhf( 3) = tt(((xh >> 1) ^ (qf(19) << 5) ^ mf( 3)) \
+ (xl ^ qf(27) ^ qf( 3))); \
dhf( 4) = tt(((xh >> 3) ^ (qf(20) << 0) ^ mf( 4)) \
+ (xl ^ qf(28) ^ qf( 4))); \
dhf( 5) = tt(((xh << 6) ^ (qf(21) >> 6) ^ mf( 5)) \
+ (xl ^ qf(29) ^ qf( 5))); \
dhf( 6) = tt(((xh >> 4) ^ (qf(22) << 6) ^ mf( 6)) \
+ (xl ^ qf(30) ^ qf( 6))); \
dhf( 7) = tt(((xh >> 11) ^ (qf(23) << 2) ^ mf( 7)) \
+ (xl ^ qf(31) ^ qf( 7))); \
dhf( 8) = tt(rol(dhf(4), 9) + (xh ^ qf(24) ^ mf( 8)) \
+ ((xl << 8) ^ qf(23) ^ qf( 8))); \
dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \
+ ((xl >> 6) ^ qf(16) ^ qf( 9))); \
dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \
+ ((xl << 6) ^ qf(17) ^ qf(10))); \
dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \
+ ((xl << 4) ^ qf(18) ^ qf(11))); \
dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \
+ ((xl >> 3) ^ qf(19) ^ qf(12))); \
dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \
+ ((xl >> 4) ^ qf(20) ^ qf(13))); \
dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \
+ ((xl >> 7) ^ qf(21) ^ qf(14))); \
dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \
+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
} while (0)
#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
#define FOLDb FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH)
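/* FOLD is BMW's output transform (f2): xl/xh fold the expanded q[16..31],
   then each output word dh(i) mixes xh, xl, one q word and one message
   word with the shift/rotate pattern of the specification. */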
#define DECL_BMW \
sph_u64 bmwH[16]; \
/* load initial constants */
#define BMW_I \
do { \
memcpy(bmwH, bmwIV512, sizeof bmwH); \
hashptr = 0; \
hashctA = 0; \
} while (0)
/* load hash for loop */
#define BMW_U \
do { \
const void *data = hash; \
size_t len = 64; \
unsigned char *buf; \
\
hashctA += (sph_u64)len << 3; \
buf = hashbuf; \
memcpy(buf, data, 64); \
hashptr = 64; \
} while (0)
/* bmw512 hash loaded */
/* hash = bmw512(loaded) */
#define BMW_C \
do { \
void *dst = hash; \
size_t out_size_w64 = 8; \
unsigned char *data; \
sph_u64 *dh; \
unsigned char *out; \
size_t ptr, u, v; \
unsigned z; \
sph_u64 h1[16], h2[16], *h; \
data = hashbuf; \
ptr = hashptr; \
z = 0x80 >> 0; \
data[ptr ++] = ((0 & -z) | z) & 0xFF; \
memset(data + ptr, 0, (sizeof(char)*128) - 8 - ptr); \
sph_enc64le_aligned(data + (sizeof(char)*128) - 8, \
SPH_T64(hashctA + 0)); \
/* Two-pass finalization follows, wrapped in a break loop: \
FOLDb reads the chaining value h and the message data, \
and writes the new state through dh. */ \
h = bmwH; \
dh = h2; \
for (;;) { \
FOLDb; \
/* dh gets changed for 2nd run */ \
if (dh == h1) break; \
for (u = 0; u < 16; u ++) \
sph_enc64le_aligned(data + 8 * u, h2[u]); \
dh = h1; \
h = (sph_u64*)final_b; \
} \
/* end wrapped for break loop */ \
out = dst; \
for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) \
sph_enc64le(out + 8 * u, h1[v]); \
} while (0)
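/* The break loop runs FOLDb twice: first with the real chaining value
   bmwH, then, after re-encoding that result as the message block, with the
   final_b constant vector (below) -- BMW's finalization pass. The digest is
   the last eight words of the second result. */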
/*
static void
compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16])
{
#define M(x) sph_dec64le_aligned(data + 8 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
FOLDb;
#undef M
#undef H
#undef dH
}
*/
static const sph_u64 final_b[16] = {
SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1),
SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3),
SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5),
SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7),
SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9),
SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab),
SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad),
SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf)
};
#ifdef __cplusplus
}
#endif


@@ -1,61 +0,0 @@
/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
/**
* BMW interface. BMW (aka "Blue Midnight Wish") is a family of
* functions which differ by their output size; this implementation
* defines BMW for output sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_bmw.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_BMW_H__
#define SPH_BMW_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "sph_types.h"
#define SPH_SIZE_bmw512 512
typedef struct {
#ifndef DOXYGEN_IGNORE
sph_u64 bmwH[16];
#endif
} sph_bmw_big_context;
typedef sph_bmw_big_context sph_bmw512_context;
#ifdef __cplusplus
}
#endif
#endif


@@ -168,6 +168,66 @@ int cube_4way_close( cube_4way_context *sp, void *output )
return 0;
}
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
const void *data, size_t size )
{
__m512i *h = (__m512i*)sp->h;
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
: (__m128i*)IV256 );
sp->hashlen = hashbitlen/128;
sp->blocksize = 32/16;
sp->rounds = 16;
sp->pos = 0;
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
const int len = size >> 4;
const __m512i *in = (__m512i*)data;
__m512i *hash = (__m512i*)output;
int i;
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_4way( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
m512_const2_64( 0, 0x0000000000000080 ) );
transform_4way( sp );
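// CubeHash finalization: flip bit 0 of the last state word (x31) in each
// lane, then run ten more transforms before the digest is copied out.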
sp->h[7] = _mm512_xor_si512( sp->h[7],
m512_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i )
transform_4way( sp );
memcpy( hash, sp->h, sp->hashlen<<6);
return 0;
}
int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size )
{
@@ -376,4 +436,62 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
return 0;
}
int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
const void *data, size_t size )
{
__m256i *h = (__m256i*)sp->h;
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
: (__m128i*)IV256 );
sp->hashlen = hashbitlen/128;
sp->blocksize = 32/16;
sp->rounds = 16;
sp->pos = 0;
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
const int len = size >> 4;
const __m256i *in = (__m256i*)data;
__m256i *hash = (__m256i*)output;
int i;
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_2way( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
m256_const2_64( 0, 0x0000000000000080 ) );
transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7],
m256_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i ) transform_2way( sp );
memcpy( hash, sp->h, sp->hashlen<<5 );
return 0;
}
#endif


@@ -21,15 +21,12 @@ typedef struct _cube_4way_context cube_4way_context;
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
int blockbytes );
// reinitialize context with same parameters, much faster.
int cube_4way_reinit( cube_4way_context *sp );
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
int cube_4way_close( cube_4way_context *sp, void *output );
int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size );
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
const void *data, size_t size );
#endif
@@ -48,15 +45,12 @@ typedef struct _cube_2way_context cube_2way_context;
int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
int blockbytes );
// reinitialize context with same parameters, much faster.
int cube_2way_reinit( cube_2way_context *sp );
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
int cube_2way_close( cube_2way_context *sp, void *output );
int cube_2way_update_close( cube_2way_context *sp, void *output,
const void *data, size_t size );
int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
const void *data, size_t size );
#endif


@@ -21,7 +21,27 @@ static void transform( cubehashParam *sp )
int r;
const int rounds = sp->rounds;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
register __m512i x0, x1;
x0 = _mm512_load_si512( (__m512i*)sp->x );
x1 = _mm512_load_si512( (__m512i*)sp->x + 1 );
for ( r = 0; r < rounds; ++r )
{
x1 = _mm512_add_epi32( x0, x1 );
x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
x0 = _mm512_xor_si512( mm512_rol_32(
mm512_swap256_128( x0 ), 11 ), x1 );
x1 = mm512_swap64_32( x1 );
}
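// One full CubeHash round per iteration: the rotate-7 and rotate-11
// half-rounds, with their word swaps expressed as permutes of the two
// 512-bit state registers.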
_mm512_store_si512( (__m512i*)sp->x, x0 );
_mm512_store_si512( (__m512i*)sp->x + 1, x1 );
#elif defined(__AVX2__)
register __m256i x0, x1, x2, x3, y0, y1;


@@ -7,7 +7,6 @@
* - implements NIST hash api
* - assumes that message length is a multiple of 8 bits
* - _ECHO_VPERM_ must be defined if compiling with ../main.c
* - define NO_AES_NI for aes_ni version
*
* Cagdas Calik
* ccalik@metu.edu.tr
@@ -21,13 +20,7 @@
#include "hash_api.h"
//#include "vperm.h"
#include <immintrin.h>
/*
#ifndef NO_AES_NI
#include <wmmintrin.h>
#else
#include <tmmintrin.h>
#endif
*/
#include "simd-utils.h"
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -179,53 +172,53 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
for(b = 0; b < uBlockCount; b++)
{
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
// load message
for(j = ctx->uHashSize / 256; j < 4; j++)
{
for(i = 0; i < 4; i++)
{
_state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
}
}
// save state
SAVESTATE(_statebackup, _state);
k1 = ctx->k;
for(r = 0; r < ctx->uRounds / 2; r++)
{
ECHO_ROUND_UNROLL2;
}
if(ctx->uHashSize == 256)
{
for(i = 0; i < 4; i++)
{
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
}
}
else
{
for(i = 0; i < 4; i++)
{
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
}
}
pmsg += ctx->uBlockLength;
}
SAVESTATE(ctx->state, _state);
@@ -390,13 +383,13 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
}
// Store the hash value
_mm_store_si128((__m128i*)hashval + 0, state->state[0][0]);
_mm_store_si128((__m128i*)hashval + 1, state->state[1][0]);
if(state->uHashSize == 512)
{
_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
_mm_store_si128((__m128i*)hashval + 2, state->state[2][0]);
_mm_store_si128((__m128i*)hashval + 3, state->state[3][0]);
}
return SUCCESS;
@@ -513,18 +506,177 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
}
// Store the hash value
_mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
_mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
if( state->uHashSize == 512 )
{
_mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
_mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
}
return SUCCESS;
}
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength datalen )
{
int i, j;
state->k = m128_zero;
state->processed_bits = 0;
state->uBufferBytes = 0;
switch( nHashSize )
{
case 256:
state->uHashSize = 256;
state->uBlockLength = 192;
state->uRounds = 8;
state->hashsize = m128_const_64( 0, 0x100 );
state->const1536 = m128_const_64( 0, 0x600 );
break;
case 512:
state->uHashSize = 512;
state->uBlockLength = 128;
state->uRounds = 10;
state->hashsize = m128_const_64( 0, 0x200 );
state->const1536 = m128_const_64( 0, 0x400 );
break;
default:
return BAD_HASHBITLEN;
}
for(i = 0; i < 4; i++)
for(j = 0; j < nHashSize / 256; j++)
state->state[i][j] = state->hashsize;
for(i = 0; i < 4; i++)
for(j = nHashSize / 256; j < 4; j++)
state->state[i][j] = m128_zero;
unsigned int uBlockCount, uRemainingBytes;
if( (state->uBufferBytes + datalen) >= state->uBlockLength )
{
if( state->uBufferBytes != 0 )
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
// Process buffer
Compress( state, state->buffer, 1 );
state->processed_bits += state->uBlockLength * 8;
data += state->uBlockLength - state->uBufferBytes;
datalen -= state->uBlockLength - state->uBufferBytes;
}
// buffer now does not contain any unprocessed bytes
uBlockCount = datalen / state->uBlockLength;
uRemainingBytes = datalen % state->uBlockLength;
if( uBlockCount > 0 )
{
Compress( state, data, uBlockCount );
state->processed_bits += uBlockCount * state->uBlockLength * 8;
data += uBlockCount * state->uBlockLength;
}
if( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
else
{
memcpy( state->buffer + state->uBufferBytes, (void*)data, datalen );
state->uBufferBytes += datalen;
}
__m128i remainingbits;
// Add remaining bytes in the buffer
state->processed_bits += state->uBufferBytes * 8;
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
// Pad with 0x80
state->buffer[state->uBufferBytes++] = 0x80;
// Enough buffer space for padding in this block?
if( (state->uBlockLength - state->uBufferBytes) >= 18 )
{
// Pad with zeros
memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Last block contains message bits?
if( state->uBufferBytes == 1 )
{
state->k = _mm_xor_si128( state->k, state->k );
state->k = _mm_sub_epi64( state->k, state->const1536 );
}
else
{
state->k = _mm_add_epi64( state->k, remainingbits );
state->k = _mm_sub_epi64( state->k, state->const1536 );
}
// Compress
Compress( state, state->buffer, 1 );
}
else
{
// Fill with zero and compress
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - state->uBufferBytes );
state->k = _mm_add_epi64( state->k, remainingbits );
state->k = _mm_sub_epi64( state->k, state->const1536 );
Compress( state, state->buffer, 1 );
// Last block
memset( state->buffer, 0, state->uBlockLength - 18 );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Compress the last block
state->k = _mm_xor_si128( state->k, state->k );
state->k = _mm_sub_epi64( state->k, state->const1536 );
Compress( state, state->buffer, 1) ;
}
// Store the hash value
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
if( state->uHashSize == 512 )
{
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
}
return SUCCESS;
}
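Both padding paths in echo_full above write the same 18-byte tail; as a cross-check, here is a minimal scalar sketch of that layout (the helper name and the little-endian stores are assumptions, mirroring the direct pointer casts in the code):

#include <stdint.h>
#include <string.h>

// Sketch only: the final-block tail as written above -- a 16-bit hash size
// at blocklen-18, the 64-bit processed-bits counter at blocklen-16, and the
// upper 8 bytes of the 128-bit counter left zero. Little-endian assumed.
static void echo_tail_sketch( uint8_t *block, unsigned blocklen,
                              uint16_t hashsize, uint64_t processed_bits )
{
    memcpy( block + blocklen - 18, &hashsize, 2 );
    memcpy( block + blocklen - 16, &processed_bits, 8 );
    memset( block + blocklen - 8, 0, 8 );
}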
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
View File
@@ -15,7 +15,7 @@
#ifndef HASH_API_H
#define HASH_API_H
#ifndef NO_AES_NI
#ifdef __AES__
#define HASH_IMPL_STR "ECHO-aesni"
#else
#define HASH_IMPL_STR "ECHO-vperm"
@@ -55,6 +55,8 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength databitlen );
#endif // HASH_API_H
View File
@@ -1,78 +1,37 @@
#if defined(__AVX512VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
//#if 0
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h"
#include "echo-hash-4way.h"
/*
#include <memory.h>
#include "miner.h"
#include "hash_api.h"
//#include "vperm.h"
#include <immintrin.h>
static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
{
0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57,
0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234
};
*/
/*
#ifndef NO_AES_NI
#include <wmmintrin.h>
#else
#include <tmmintrin.h>
#endif
*/
// not used
/*
const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
*/
/*
MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101};
MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
*/
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
// do these need to be reversed?
#define mul2mask \
m512_const4_32( 0x00001b00, 0, 0, 0 )
_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
#define lsbmask m512_const1_32( 0x01010101 )
#define ECHO_SUBBYTES( state, i, j ) \
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
k1 = _mm512_add_epi32( k1, m512_one_32 )
k1 = _mm512_add_epi32( k1, m512_one_128 );
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
{ \
const int j1 = ( j+1 ) & 3; \
const int j2 = ( j+2 ) & 3; \
const int j3 = ( j+3 ) & 3; \
const int j1 = ( (j)+1 ) & 3; \
const int j2 = ( (j)+2 ) & 3; \
const int j3 = ( (j)+3 ) & 3; \
s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \
t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
t1 = _mm512_and_si128( t1, lsbmask );\
t1 = _mm512_and_si512( t1, lsbmask );\
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ] [j ] = s2; \
@@ -97,7 +56,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
state2[ 2 ][ j ] = _mm512_xor_si512128( state2[ 2 ][ j ], s2 ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
@@ -108,12 +67,12 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 )
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
} while(0)
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES(_state, 0, 0);\
ECHO_SUBBYTES(_state, 1, 0);\
ECHO_SUBBYTES(_state, 1, 0);\
ECHO_SUBBYTES(_state, 2, 0);\
ECHO_SUBBYTES(_state, 3, 0);\
ECHO_SUBBYTES(_state, 0, 1);\
@@ -153,8 +112,6 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
#define SAVESTATE(dst, src)\
dst[0][0] = src[0][0];\
dst[0][1] = src[0][1];\
@@ -173,33 +130,44 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
dst[3][2] = src[3][2];\
dst[3][3] = src[3][3]
void echo_4way_compress( echo_4way_context *ctx, const unsigned char *pmsg,
// blockcount always 1
void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
unsigned int uBlockCount )
{
unsigned int r, b, i, j;
__m512i t1, t2, s2, k1;
__m512i _state[4][4], _state2[4][4], _statebackup[4][4];
// unroll
for ( i = 0; i < 4; i++ )
for ( j = 0; j < ctx->uHashSize / 256; j++ )
_state[ i ][ j ] = ctx->state[ i ][ j ];
_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
_state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ];
_state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ];
_state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ];
_state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ];
_state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ];
_state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ];
_state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ];
_state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ];
_state[ 2 ][ 2 ] = ctx->state[ 2 ][ 2 ];
_state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ];
_state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ];
_state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ];
_state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ];
_state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ];
for ( b = 0; b < uBlockCount; b++ )
{
ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 );
// load message, make aligned, remove loadu
for( j = ctx->uHashSize / 256; j < 4; j++ )
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ j ] = _mm512_loadu_si512(
(__m512i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
_state[ i ][ j ] = _mm512_load_si512(
pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
}
}
// save state
SAVESTATE( _statebackup, _state );
@@ -254,8 +222,6 @@ void echo_4way_compress( echo_4way_context *ctx, const unsigned char *pmsg,
}
int echo_4way_init( echo_4way_context *ctx, int nHashSize )
{
int i, j;
@@ -270,23 +236,22 @@ int echo_4way_init( echo_4way_context *ctx, int nHashSize )
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = _mm512_const4_32( 0, 0, 0, 0x100 );
ctx->const1536 = _mm512_const4_32( 0, 0, 0, 0x600 );
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
break;
case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = _mm512_const4_32( 0, 0, 0, 0x200 );
ctx->const1536 = _mm512_const4_32( 0, 0, 0, 0x400);
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
break;
default:
return BAD_HASHBITLEN;
return 1;
}
for( i = 0; i < 4; i++ )
for( j = 0; j < nHashSize / 256; j++ )
ctx->state[ i ][ j ] = ctx->hashsize;
@@ -295,265 +260,145 @@ int echo_4way_init( echo_4way_context *ctx, int nHashSize )
for( j = nHashSize / 256; j < 4; j++ )
ctx->state[ i ][ j ] = m512_zero;
return SUCCESS;
}
int echo_4way_update( echo_4way_context *state, const BitSequence *data, DataLength databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
uByteLength = (unsigned int)(databitlen / 8);
if ( ( state->uBufferBytes + uByteLength ) >= state->uBlockLength )
{
if ( state->uBufferBytes != 0 )
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
// Process buffer
echo_4way_compress( state, state->buffer, 1 );
state->processed_bits += state->uBlockLength * 8;
data += state->uBlockLength - state->uBufferBytes;
uByteLength -= state->uBlockLength - state->uBufferBytes;
}
// buffer now does not contain any unprocessed bytes
uBlockCount = uByteLength / state->uBlockLength;
uRemainingBytes = uByteLength % state->uBlockLength;
if ( uBlockCount > 0 )
{
echo_4way_compress( state, data, uBlockCount );
state->processed_bits += uBlockCount * state->uBlockLength * 8;
data += uBlockCount * state->uBlockLength;
}
if ( uRemainingBytes > 0 )
{
memcpy( state->buffer, (void*)data, uRemainingBytes );
}
state->uBufferBytes = uRemainingBytes;
}
else
{
memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
state->uBufferBytes += uByteLength;
}
return 0;
}
echo_4way_close( echo_4way_context *state, BitSequence *hashval )
int echo_4way_update_close( echo_4way_context *state, void *hashval,
const void *data, int databitlen )
{
__m512i remainingbits;
// bytelen is either 32 (maybe), 64 or 80 or 128!
// all are less than full block.
// Add remaining bytes in the buffer
state->processed_bits += state->uBufferBytes * 8;
int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
const int vblen = state->uBlockLength / 16; // 16 bytes per lane
__m512i remainingbits;
remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 );
// Pad with 0x80
state->buffer[ state->uBufferBytes++ ] = 0x80;
// Enough buffer space for padding in this block?
if ( ( state->uBlockLength - state->uBufferBytes ) >= 18)
{
// Pad with zeros
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - ( state->uBufferBytes + 18 ) );
// Hash size
*( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) )
= state->uHashSize;
// Processed bits
*( ( DataLength*)( state->buffer + state->uBlockLength - 16 ) )
= state->processed_bits;
*( ( DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0;
// Last block contains message bits?
if ( state->uBufferBytes == 1 )
{
state->k = _mm512_xor_si512( state->k, state->k );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
else
{
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
// Compress
echo_4way_compress( state, state->buffer, 1 );
}
else
{
// Fill with zero and compress
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - state->uBufferBytes );
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
echo_4way_compress( state, state->buffer, 1 );
// Last block
memset( state->buffer, 0, state->uBlockLength - 18 );
// Hash size
*( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) )
= state->uHashSize;
// Processed bits
*( (DataLength*)( state->buffer + state->uBlockLength - 16 ) )
= state->processed_bits;
*( (DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0;
// Compress the last block
state->k = _mm512_xor_si512(state->k, state->k);
state->k = _mm512_sub_epi64(state->k, state->const1536);
echo_4way_compress(state, state->buffer, 1);
}
// Store the hash value
_mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0][ 0 ]);
_mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1][ 0 ]);
if ( state->uHashSize == 512 )
{
_mm512_storeu_si512((__m512i*)hashval + 2, state->state[ 2 ][ 0 ]);
_mm512_storeu_si512((__m512i*)hashval + 3, state->state[ 3 ][ 0 ]);
}
return 0;
}
int echo_4way_update_close( echo_4way_context *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
uByteLength = (unsigned int)(databitlen / 8);
if ( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
{
if ( state->uBufferBytes != 0 )
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
// Process buffer
echo_4way_compress( state, state->buffer, 1 );
state->processed_bits += state->uBlockLength * 8;
data += state->uBlockLength - state->uBufferBytes;
uByteLength -= state->uBlockLength - state->uBufferBytes;
}
// buffer now does not contain any unprocessed bytes
uBlockCount = uByteLength / state->uBlockLength;
uRemainingBytes = uByteLength % state->uBlockLength;
if ( uBlockCount > 0 )
{
echo_4way_compress( state, data, uBlockCount );
state->processed_bits += uBlockCount * state->uBlockLength * 8;
data += uBlockCount * state->uBlockLength;
}
if ( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
else
{
memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
state->uBufferBytes += uByteLength;
}
__m512i remainingbits;
// Add remaining bytes in the buffer
state->processed_bits += state->uBufferBytes * 8;
remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 );
// Pad with 0x80
state->buffer[ state->uBufferBytes++ ] = 0x80;
// Enough buffer space for padding in this block?
if ( (state->uBlockLength - state->uBufferBytes) >= 18 )
if ( databitlen == 1024 )
{
// Pad with zeros
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - (state->uBufferBytes + 18) );
echo_4way_compress( state, data, 1 );
state->processed_bits = 1024;
remainingbits = m512_const2_64( 0, -1024 );
vlen = 0;
}
else
{
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
memcpy_512( state->buffer, data, vlen );
state->processed_bits += (unsigned int)( databitlen );
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) )
= state->uHashSize;
}
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
state->buffer[ vblen-2 ] =
_mm512_set4_epi32( (uint32_t)state->uHashSize << 16, 0, 0, 0 );
state->buffer[ vblen-1 ] =
_mm512_set4_epi64( 0, state->processed_bits,
0, state->processed_bits );
// Last block contains message bits?
if( state->uBufferBytes == 1 )
{
state->k = _mm512_xor_si512( state->k, state->k );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
}
else
{
state->k = _mm_add_epi64( state->k, remainingbits );
state->k = _mm_sub_epi64( state->k, state->const1536 );
}
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
// Compress
echo_4way_compress( state, state->buffer, 1 );
}
else
{
// Fill with zero and compress
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - state->uBufferBytes );
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
echo_4way_compress( state, state->buffer, 1 );
echo_4way_compress( state, state->buffer, 1 );
// Last block
memset( state->buffer, 0, state->uBlockLength - 18 );
_mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] );
_mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Compress the last block
state->k = _mm512_xor_si512( state->k, state->k );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
echo_4way_compress( state, state->buffer, 1) ;
}
// Store the hash value
_mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] );
_mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] );
if ( state->uHashSize == 512 )
{
_mm512_storeu_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
_mm512_storeu_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
}
return 0;
if ( state->uHashSize == 512 )
{
_mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
_mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
}
return 0;
}
int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
const void *data, int datalen )
{
int i, j;
int databitlen = datalen * 8;
ctx->k = m512_zero;
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
switch( nHashSize )
{
case 256:
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
break;
case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
break;
default:
return 1;
}
for( i = 0; i < 4; i++ )
for( j = 0; j < nHashSize / 256; j++ )
ctx->state[ i ][ j ] = ctx->hashsize;
for( i = 0; i < 4; i++ )
for( j = nHashSize / 256; j < 4; j++ )
ctx->state[ i ][ j ] = m512_zero;
// bytelen is either 32 (maybe), 64 or 80 or 128!
// all are less than full block.
int vlen = datalen / 32;
const int vblen = ctx->uBlockLength / 16; // 16 bytes per lane
__m512i remainingbits;
if ( databitlen == 1024 )
{
echo_4way_compress( ctx, data, 1 );
ctx->processed_bits = 1024;
remainingbits = m512_const2_64( 0, -1024 );
vlen = 0;
}
else
{
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
memcpy_512( ctx->buffer, data, vlen );
ctx->processed_bits += (unsigned int)( databitlen );
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
}
ctx->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
ctx->buffer[ vblen-2 ] =
_mm512_set4_epi32( (uint32_t)ctx->uHashSize << 16, 0, 0, 0 );
ctx->buffer[ vblen-1 ] =
_mm512_set4_epi64( 0, ctx->processed_bits,
0, ctx->processed_bits );
ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
echo_4way_compress( ctx, ctx->buffer, 1 );
_mm512_store_si512( (__m512i*)hashval + 0, ctx->state[ 0 ][ 0] );
_mm512_store_si512( (__m512i*)hashval + 1, ctx->state[ 1 ][ 0] );
if ( ctx->uHashSize == 512 )
{
_mm512_store_si512( (__m512i*)hashval + 2, ctx->state[ 2 ][ 0 ] );
_mm512_store_si512( (__m512i*)hashval + 3, ctx->state[ 3 ][ 0 ] );
}
return 0;
}
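A hypothetical call site for echo_4way_full, assuming the input has already been interleaved 4x128; the wrapper name is illustrative only:

// Hypothetical usage sketch: ECHO-512 over four interleaved 64-byte inputs
// (vinput already interleaved 4x128). datalen is in bytes per lane, so 64
// bytes = 512 bits and the buffered (non-1024-bit) path is taken.
static void echo512_4way_sketch( void *vhash, const __m512i *vinput )
{
    echo_4way_context ctx __attribute__ ((aligned (64)));
    echo_4way_full( &ctx, vhash, 512, vinput, 64 );
}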
#endif
View File
@@ -32,5 +32,8 @@ int echo_close( echo_4way_context *state, void *hashval );
int echo_4way_update_close( echo_4way_context *state, void *hashval,
const void *data, int databitlen );
int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
const void *data, int datalen );
#endif
#endif
View File
@@ -4,7 +4,7 @@
#include <stdlib.h>
#include <memory.h>
#include <math.h>
#include "simd-utils.h"
#include "sph_gost.h"
#ifdef __cplusplus
@@ -696,9 +696,26 @@ static void AddModulo512(const void *a,const void *b,void *c)
static void AddXor512(const void *a,const void *b,void *c)
{
const unsigned long long *A=a, *B=b;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
casti_m512i( b, 0 ) );
#elif defined(__AVX2__)
casti_m256i( c, 0 ) = _mm256_xor_si256( casti_m256i( a, 0 ),
casti_m256i( b, 0 ) );
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
casti_m256i( b, 1 ) );
#elif defined(__SSE2__)
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
casti_m128i( b, 0 ) );
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
casti_m128i( b, 1 ) );
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
casti_m128i( b, 2 ) );
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
casti_m128i( b, 3 ) );
#else
const unsigned long long *A=a, *B=b;
unsigned long long *C=c;
#ifdef FULL_UNROLL
C[0] = A[0] ^ B[0];
C[1] = A[1] ^ B[1];
C[2] = A[2] ^ B[2];
@@ -707,12 +724,6 @@ static void AddXor512(const void *a,const void *b,void *c)
C[5] = A[5] ^ B[5];
C[6] = A[6] ^ B[6];
C[7] = A[7] ^ B[7];
#else
int i = 0;
for(i=0; i<8; i++) {
C[i] = A[i] ^ B[i];
}
#endif
}
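The casti_* accessors used in AddXor512 come from the project's simd-utils.h; conceptually they are indexed pointer casts along these lines (a sketch of the assumed definitions, not a new API):

// Assumed shape of the simd-utils.h accessors used above: treat p as an
// array of vectors and return element i as an lvalue.
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
#define casti_m512i(p,i) (((__m512i*)(p))[(i)])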
@@ -893,31 +904,32 @@ static void g_N(const unsigned char *N,unsigned char *h,const unsigned char *m)
static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long long length,unsigned char *out)
{
unsigned char v512[64] = {
unsigned char v512[64] __attribute__((aligned(64))) = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x00
};
unsigned char v0[64] = {
};
unsigned char v0[64] __attribute__((aligned(64))) = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};
unsigned char Sigma[64] __attribute__((aligned(64))) = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};
unsigned char Sigma[64] = {
unsigned char N[64] __attribute__((aligned(64))) = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};
unsigned char N[64] = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};
unsigned char m[64], *hash = IV;
unsigned char m[64] __attribute__((aligned(64)));
unsigned char *hash = IV;
unsigned long long len = length;
// Stage 2
@@ -952,7 +964,7 @@ static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long
static void hash_512(const unsigned char *message, unsigned long long length, unsigned char *out)
{
unsigned char IV[64] = {
unsigned char IV[64] __attribute__((aligned(64))) = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
View File
@@ -81,9 +81,9 @@ typedef struct {
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
unsigned char buf[64] __attribute__((aligned(64)));
sph_u32 V[5][8] __attribute__((aligned(64)));
size_t ptr;
sph_u32 V[5][8];
#endif
} sph_gost512_context;
View File
@@ -73,7 +73,7 @@ __m128i ALL_FF;
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm_xor_si128(b0, a4);\
b6 = _mm_xor_si128(b6, a4);\
@@ -195,7 +195,7 @@ __m128i ALL_FF;
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant P1024 */\
xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
@@ -209,7 +209,6 @@ __m128i ALL_FF;
\
/* AddRoundConstant P1024 */\
xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
@@ -218,7 +217,6 @@ __m128i ALL_FF;
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
/* SubBytes + MixBytes */\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
View File
@@ -2,13 +2,6 @@
//#define TASM
#define TINTR
//#define AES_NI
//#ifdef AES_NI
// specify AES-NI, AVX (with AES-NI) or vector-permute implementation
//#ifndef NO_AES_NI
// Not to be confused with AVX512VAES
#define VAES
// #define VAVX
View File
@@ -14,7 +14,7 @@
#include "miner.h"
#include "simd-utils.h"
#ifndef NO_AES_NI
#ifdef __AES__
#include "groestl-version.h"
@@ -67,8 +67,12 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT(ctx->chaining);
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
// INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -87,8 +91,9 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT(ctx->chaining);
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
// INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -180,6 +185,82 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
return SUCCESS_GR;
}
int groestl512_full( hashState_groestl* ctx, void* output,
const void* input, uint64_t databitlen )
{
int i;
ctx->hashlen = 64;
SET_CONSTANTS();
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
// --- update ---
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
//--- final ---
blocks++; // adjust for final block
if ( i == SIZE512 - 1 )
{
// only 128 bits left in buffer, all padding at once
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
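A hypothetical one-shot call, e.g. for an 80-byte block header; note the last argument is a bit count (640), consistent with update_and_final_groestl:

// Hypothetical usage sketch: one-shot Groestl-512 of an 80-byte header.
static void groestl512_header_sketch( void *hash64, const void *header80 )
{
    hashState_groestl ctx __attribute__ ((aligned (64)));
    groestl512_full( &ctx, hash64, header80, 640 );   // length in bits
}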
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
const void* input, DataLength_gr databitlen )
{
@@ -230,6 +311,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output
View File
@@ -87,5 +87,6 @@ HashReturn_gr final_groestl( hashState_groestl*, void* );
HashReturn_gr update_and_final_groestl( hashState_groestl*, void*,
const void*, DataLength_gr );
int groestl512_full( hashState_groestl*, void*, const void*, uint64_t );
#endif /* __hash_h */
View File
@@ -11,7 +11,7 @@
#include "miner.h"
#include "simd-utils.h"
#ifndef NO_AES_NI
#ifdef __AES__
#include "groestl-version.h"
@@ -86,8 +86,11 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256(ctx->chaining);
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
// INIT256(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
View File
@@ -93,9 +93,6 @@ typedef enum
typedef struct {
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
// u64 block_counter; /* message block counter */
int hashlen; // bytes
int blk_count;
int buf_ptr; /* data buffer pointer */
View File
@@ -0,0 +1,64 @@
#include "groestl-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#if defined(GROESTL_4WAY_VAES)
#include "groestl512-hash-4way.h"
void groestl_4way_hash( void *output, const void *input )
{
uint32_t hash[16*4] __attribute__ ((aligned (128)));
groestl512_4way_context ctx;
groestl512_4way_init( &ctx, 64 );
groestl512_4way_update_close( &ctx, hash, input, 640 );
groestl512_4way_init( &ctx, 64 );
groestl512_4way_update_close( &ctx, hash, hash, 512 );
dintrlv_4x128( output, output+32, output+64, output+96, hash, 256 );
}
int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (128)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
mm512_bswap32_intrlv80_4x128( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+ 4, n+1 );
be32enc( noncep+ 8, n+2 );
be32enc( noncep+12, n+3 );
groestl_4way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash+(lane<<3) )[7] <= Htarg )
if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
n += 4;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#endif
View File
@@ -0,0 +1,23 @@
#include "groestl-gate.h"
bool register_dmd_gr_algo( algo_gate_t *gate )
{
#if defined (GROESTL_4WAY_VAES)
gate->scanhash = (void*)&scanhash_groestl_4way;
gate->hash = (void*)&groestl_4way_hash;
#else
init_groestl_ctx();
gate->scanhash = (void*)&scanhash_groestl;
gate->hash = (void*)&groestlhash;
#endif
gate->optimizations = AES_OPT | VAES_OPT;
return true;
};
bool register_groestl_algo( algo_gate_t* gate )
{
register_dmd_gr_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
};
View File
@@ -0,0 +1,31 @@
#ifndef GROESTL_GATE_H__
#define GROESTL_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define GROESTL_4WAY_VAES 1
#endif
bool register_dmd_gr_algo( algo_gate_t* gate );
bool register_groestl_algo( algo_gate_t* gate );
#if defined(GROESTL_4WAY_VAES)
void groestl_4way_hash( void *state, const void *input );
int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void groestlhash( void *state, const void *input );
int scanhash_groestl( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_groestl_ctx();
#endif
#endif
View File
@@ -1,22 +1,20 @@
#include "algo-gate-api.h"
#include "groestl-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#ifdef NO_AES_NI
#include "sph_groestl.h"
#else
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "sph_groestl.h"
#endif
typedef struct
{
#ifdef NO_AES_NI
sph_groestl512_context groestl1, groestl2;
#else
#ifdef __AES__
hashState_groestl groestl1, groestl2;
#else
sph_groestl512_context groestl1, groestl2;
#endif
} groestl_ctx_holder;
@@ -25,12 +23,12 @@ static groestl_ctx_holder groestl_ctx;
void init_groestl_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init( &groestl_ctx.groestl1 );
sph_groestl512_init( &groestl_ctx.groestl2 );
#else
#ifdef __AES__
init_groestl( &groestl_ctx.groestl1, 64 );
init_groestl( &groestl_ctx.groestl2, 64 );
#else
sph_groestl512_init( &groestl_ctx.groestl1 );
sph_groestl512_init( &groestl_ctx.groestl2 );
#endif
}
@@ -40,18 +38,18 @@ void groestlhash( void *output, const void *input )
groestl_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &groestl_ctx, sizeof(groestl_ctx) );
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl1, input, 80);
sph_groestl512_close(&ctx.groestl1, hash);
sph_groestl512(&ctx.groestl2, hash, 64);
sph_groestl512_close(&ctx.groestl2, hash);
#else
#ifdef __AES__
update_and_final_groestl( &ctx.groestl1, (char*)hash,
(const char*)input, 640 );
update_and_final_groestl( &ctx.groestl2, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512(&ctx.groestl1, input, 80);
sph_groestl512_close(&ctx.groestl1, hash);
sph_groestl512(&ctx.groestl2, hash, 64);
sph_groestl512_close(&ctx.groestl2, hash);
#endif
memcpy(output, hash, 32);
}
@@ -78,15 +76,12 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
groestlhash(hash, endiandata);
if (hash[7] <= Htarg )
if ( fulltest(hash, ptarget))
{
if ( fulltest(hash, ptarget) && !opt_benchmark )
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
submit_solution( work, hash, mythr );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
@@ -94,20 +89,3 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
return 0;
}
bool register_dmd_gr_algo( algo_gate_t* gate )
{
init_groestl_ctx();
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_groestl;
gate->hash = (void*)&groestlhash;
opt_target_factor = 256.0;
return true;
};
bool register_groestl_algo( algo_gate_t* gate )
{
register_dmd_gr_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
};
View File
@@ -0,0 +1,109 @@
/* hash.c Aug 2011
* groestl512-hash-4way https://github.com/JayDDee/cpuminer-opt 2019-12.
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
// Optimized for hash and data lengths that are integral multiples of __m128i
#include <memory.h>
#include "groestl256-intr-4way.h"
#include "miner.h"
#include "simd-utils.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
{
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m512_zero;
ctx->buffer[i] = m512_zero;
}
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
// uint64_t len = U64BIG((uint64_t)LENGTH);
// ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
// INIT256_4way(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
return 0;
}
int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
const void* input, uint64_t databitlen )
{
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m512i* in = (__m512i*)input;
int i;
// --- update ---
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
//--- final ---
blocks++; // adjust for final block
if ( i == SIZE256 - 1 )
{
// only 1 vector left in buffer, all padding at once
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
}
else
{
// add first padding
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = m512_zero;
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
}
// digest final padding block and do output transform
TF512_4way( ctx->chaining, ctx->buffer );
OF512_4way( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
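A hypothetical call sequence, hashing four interleaved 64-byte messages to 32-byte digests (hashlen is in bytes, databitlen in bits per lane; the wrapper name is illustrative):

// Hypothetical usage sketch: four-lane Groestl-256 over 64-byte inputs.
static int groestl256_4way_sketch( void *vhash, const void *vinput )
{
    groestl256_4way_context ctx __attribute__ ((aligned (128)));
    if ( groestl256_4way_init( &ctx, 32 ) ) return 1;   // 32-byte digests
    return groestl256_4way_update_close( &ctx, vhash, vinput, 512 );
}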
#endif // VAES
View File
@@ -0,0 +1,75 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#if !defined(GROESTL256_HASH_4WAY_H__)
#define GROESTL256_HASH_4WAY_H__ 1
#include "simd-utils.h"
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LENGTH (256)
//#include "brg_endian.h"
//#define NEED_UINT_64T
//#include "algo/sha/brg_types.h"
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
//#define COLS1024 (16)
#define SIZE_512 ((ROWS)*(COLS512))
//#define SIZE_1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
//#define ROUNDS1024 (14)
//#if LENGTH<=256
#define COLS (COLS512)
#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
//#else
//#define COLS (COLS1024)
//#define SIZE (SIZE1024)
//#define ROUNDS (ROUNDS1024)
//#endif
#define SIZE256 (SIZE_512/16)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE256];
__attribute__ ((aligned (64))) __m512i buffer[SIZE256];
int hashlen; // byte
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
int rem_ptr;
int databitlen; // bits
} groestl256_4way_context;
int groestl256_4way_init( groestl256_4way_context*, uint64_t );
//int reinit_groestl( hashState_groestl* );
//int groestl512_4way_update( groestl256_4way_context*, const void*,
// uint64_t );
//int groestl512_4way_close( groestl512_4way_context*, void* );
int groestl256_4way_update_close( groestl256_4way_context*, void*,
const void*, uint64_t );
#endif
#endif
View File
@@ -0,0 +1,526 @@
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
* instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#if !defined(GROESTL256_INTR_4WAY_H__)
#define GROESTL256_INTR_4WAY_H__ 1
#include "groestl256-hash-4way.h"
#if defined(__VAES__)
/* global constants */
__m512i ROUND_CONST_Lx;
__m512i ROUND_CONST_L0[ROUNDS512];
__m512i ROUND_CONST_L7[ROUNDS512];
//__m512i ROUND_CONST_P[ROUNDS1024];
//__m512i ROUND_CONST_Q[ROUNDS1024];
__m512i TRANSP_MASK;
__m512i SUBSH_MASK[8];
__m512i ALL_1B;
__m512i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm512_xor_si512(j, j);\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
i = _mm512_add_epi8(i, i);\
j = _mm512_and_si512(j, k);\
i = _mm512_xor_si512(i, j);\
}
/**/
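MUL2 is a branch-free GF(2^8) doubling of all 64 bytes at once: the compare builds a mask of bytes whose top bit is set, and 0x1b is folded back in after the shift. A scalar reference of the same operation (sketch):

#include <stdint.h>

// Scalar equivalent of MUL2 for a single byte: multiply by x in GF(2^8)
// modulo the AES/Groestl polynomial x^8 + x^4 + x^3 + x + 1 (0x11b).
static inline uint8_t xtime( uint8_t a )
{
    return (uint8_t)( ( a << 1 ) ^ ( ( a >> 7 ) * 0x1b ) );
}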
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm512_xor_si512(a0, a1);\
b0 = a2;\
a1 = _mm512_xor_si512(a1, a2);\
b1 = a3;\
a2 = _mm512_xor_si512(a2, a3);\
b2 = a4;\
a3 = _mm512_xor_si512(a3, a4);\
b3 = a5;\
a4 = _mm512_xor_si512(a4, a5);\
b4 = a6;\
a5 = _mm512_xor_si512(a5, a6);\
b5 = a7;\
a6 = _mm512_xor_si512(a6, a7);\
a7 = _mm512_xor_si512(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm512_xor_si512(b0, a4);\
b6 = _mm512_xor_si512(b6, a4);\
b1 = _mm512_xor_si512(b1, a5);\
b7 = _mm512_xor_si512(b7, a5);\
b2 = _mm512_xor_si512(b2, a6);\
b0 = _mm512_xor_si512(b0, a6);\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm512_xor_si512(b3, a7);\
b1 = _mm512_xor_si512(b1, a7);\
TEMP1 = b1;\
b4 = _mm512_xor_si512(b4, a0);\
b2 = _mm512_xor_si512(b2, a0);\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm512_xor_si512(b5, a1);\
b3 = _mm512_xor_si512(b3, a1);\
b1 = a1;\
b6 = _mm512_xor_si512(b6, a2);\
b4 = _mm512_xor_si512(b4, a2);\
TEMP2 = a2;\
b7 = _mm512_xor_si512(b7, a3);\
b5 = _mm512_xor_si512(b5, a3);\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm512_xor_si512(a0, a3);\
a1 = _mm512_xor_si512(a1, a4);\
a2 = _mm512_xor_si512(a2, a5);\
a3 = _mm512_xor_si512(a3, a6);\
a4 = _mm512_xor_si512(a4, a7);\
a5 = _mm512_xor_si512(a5, b0);\
a6 = _mm512_xor_si512(a6, b1);\
a7 = _mm512_xor_si512(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm512_xor_si512(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm512_xor_si512(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm512_xor_si512(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm512_xor_si512(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm512_xor_si512(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm512_xor_si512(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm512_xor_si512(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm512_xor_si512(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm512_xor_si512(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm512_xor_si512(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm512_xor_si512(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm512_xor_si512(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm512_xor_si512(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm512_xor_si512(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm512_xor_si512(b0, a3);\
b1 = _mm512_xor_si512(b1, a4);\
}/*MixBytes*/
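As a readability aid, the relations quoted above translate directly into a scalar per-column reference (a sketch for cross-checking only, not used by the miner; addition is XOR, doubling is xtime):

#include <stdint.h>

// Scalar sketch of one MixBytes column using the relations above.
// All indices are mod 8; '+' is XOR in GF(2^8), '2*' is xtime.
static void mixbytes_column_sketch( const uint8_t a[8], uint8_t b[8] )
{
    uint8_t t[8], x[8], y[8], w[8], v[8];
    int i;
    for ( i = 0; i < 8; i++ ) t[i] = a[i] ^ a[(i+1) & 7];
    for ( i = 0; i < 8; i++ ) x[i] = t[i] ^ t[(i+3) & 7];
    for ( i = 0; i < 8; i++ ) y[i] = t[i] ^ t[(i+2) & 7] ^ a[(i+6) & 7];
    for ( i = 0; i < 8; i++ )               // w = 2*x + y_{i+4}
        w[i] = (uint8_t)( (x[i] << 1) ^ ((x[i] >> 7) * 0x1b) ) ^ y[(i+4) & 7];
    for ( i = 0; i < 8; i++ )               // v = 2*w
        v[i] = (uint8_t)( (w[i] << 1) ^ ((w[i] >> 7) * 0x1b) );
    for ( i = 0; i < 8; i++ ) b[i] = v[(i+3) & 7] ^ y[(i+4) & 7];
}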
// calculate the round constants separately and load at startup
#define SET_CONSTANTS(){\
ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
TRANSP_MASK = _mm512_set_epi32( \
0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
SUBSH_MASK[0] = _mm512_set_epi32( \
0x33363a3d, 0x38323539, 0x3c3f3134, 0x373b3e30, \
0x23262a2d, 0x28222529, 0x2c2f2124, 0x272b2e20, \
0x13161a1d, 0x18121519, 0x1c1f1114, 0x171b1e10, \
0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00 ); \
SUBSH_MASK[1] = _mm512_set_epi32( \
0x34373c3f, 0x3a33363b, 0x3e393235, 0x303d3831, \
0x24272c2f, 0x2a23262b, 0x2e292225, 0x202d2821, \
0x14171c1f, 0x1a13161b, 0x1e191215, 0x101d1811, \
0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801 );\
SUBSH_MASK[2] = _mm512_set_epi32( \
0x35303e39, 0x3c34373d, 0x383b3336, 0x313f3a32, \
0x25202e29, 0x2c24272d, 0x282b2326, 0x212f2a22, \
0x15101e19, 0x1c14171d, 0x181b1316, 0x111f1a12, \
0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02 );\
SUBSH_MASK[3] = _mm512_set_epi32( \
0x3631383b, 0x3e35303f, 0x3a3d3437, 0x32393c33, \
0x2621282b, 0x2e25202f, 0x2a2d2427, 0x22292c23, \
0x1611181b, 0x1e15101f, 0x1a1d1417, 0x12191c13, \
0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03 );\
SUBSH_MASK[4] = _mm512_set_epi32( \
0x3732393c, 0x3f363138, 0x3b3e3530, 0x333a3d34, \
0x2722292c, 0x2f262128, 0x2b2e2520, 0x232a2d24, \
0x1712191c, 0x1f161118, 0x1b1e1510, 0x131a1d14, \
0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04 );\
SUBSH_MASK[5] = _mm512_set_epi32( \
0x30333b3e, 0x3937323a, 0x3d383631, 0x343c3f35, \
0x20232b2e, 0x2927222a, 0x2d282621, 0x242c2f25, \
0x10131b1e, 0x1917121a, 0x1d181611, 0x141c1f15, \
0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05 );\
SUBSH_MASK[6] = _mm512_set_epi32( \
0x31343d38, 0x3b30333c, 0x3f3a3732, 0x353e3936, \
0x21242d28, 0x2b20232c, 0x2f2a2722, 0x252e2926, \
0x11141d18, 0x1b10131c, 0x1f1a1712, 0x151e1916, \
0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906 );\
SUBSH_MASK[7] = _mm512_set_epi32( \
0x32353f3a, 0x3d31343e, 0x393c3033, 0x36383b37, \
0x22252f2a, 0x2d21242e, 0x292c2023, 0x26282b27, \
0x12151f1a, 0x1d11141e, 0x191c1013, 0x16181b17, \
0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07 );\
for ( i = 0; i < ROUNDS512; i++ ) \
{\
ROUND_CONST_L0[i] = _mm512_set4_epi32( 0xffffffff, 0xffffffff, \
0x70605040 ^ ( i * 0x01010101 ), 0x30201000 ^ ( i * 0x01010101 ) ); \
ROUND_CONST_L7[i] = _mm512_set4_epi32( 0x8f9fafbf ^ ( i * 0x01010101 ), \
0xcfdfefff ^ ( i * 0x01010101 ), 0x00000000, 0x00000000 ); \
}\
ROUND_CONST_Lx = _mm512_set4_epi32( 0xffffffff, 0xffffffff, \
0x00000000, 0x00000000 ); \
}while(0);\
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = ROUND_CONST_Lx;\
a0 = _mm512_xor_si512( a0, (ROUND_CONST_L0[i]) );\
a1 = _mm512_xor_si512( a1, b1 );\
a2 = _mm512_xor_si512( a2, b1 );\
a3 = _mm512_xor_si512( a3, b1 );\
a4 = _mm512_xor_si512( a4, b1 );\
a5 = _mm512_xor_si512( a5, b1 );\
a6 = _mm512_xor_si512( a6, b1 );\
a7 = _mm512_xor_si512( a7, (ROUND_CONST_L7[i]) );\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm512_xor_si512( b0, b0 );\
a0 = _mm512_shuffle_epi8( a0, (SUBSH_MASK[0]) );\
a0 = _mm512_aesenclast_epi128(a0, b0 );\
a1 = _mm512_shuffle_epi8( a1, (SUBSH_MASK[1]) );\
a1 = _mm512_aesenclast_epi128(a1, b0 );\
a2 = _mm512_shuffle_epi8( a2, (SUBSH_MASK[2]) );\
a2 = _mm512_aesenclast_epi128(a2, b0 );\
a3 = _mm512_shuffle_epi8( a3, (SUBSH_MASK[3]) );\
a3 = _mm512_aesenclast_epi128(a3, b0 );\
a4 = _mm512_shuffle_epi8( a4, (SUBSH_MASK[4]) );\
a4 = _mm512_aesenclast_epi128(a4, b0 );\
a5 = _mm512_shuffle_epi8( a5, (SUBSH_MASK[5]) );\
a5 = _mm512_aesenclast_epi128(a5, b0 );\
a6 = _mm512_shuffle_epi8( a6, (SUBSH_MASK[6]) );\
a6 = _mm512_aesenclast_epi128(a6, b0 );\
a7 = _mm512_shuffle_epi8( a7, (SUBSH_MASK[7]) );\
a7 = _mm512_aesenclast_epi128( a7, b0 );\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm512_shuffle_epi8( i0, t0 );\
i1 = _mm512_shuffle_epi8( i1, t0 );\
i2 = _mm512_shuffle_epi8( i2, t0 );\
i3 = _mm512_shuffle_epi8( i3, t0 );\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm512_unpacklo_epi16( i0, i1 );\
o1 = _mm512_unpackhi_epi16( o1, i1 );\
i2 = _mm512_unpacklo_epi16( i2, i3 );\
t0 = _mm512_unpackhi_epi16( t0, i3 );\
\
i0 = _mm512_shuffle_epi32( i0, 216 );\
o1 = _mm512_shuffle_epi32( o1, 216 );\
i2 = _mm512_shuffle_epi32( i2, 216 );\
t0 = _mm512_shuffle_epi32( t0, 216 );\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm512_unpacklo_epi32( i0, i2 );\
o1 = _mm512_unpacklo_epi32( o1, t0 );\
o2 = _mm512_unpackhi_epi32( o2, i2 );\
o3 = _mm512_unpackhi_epi32( o3, t0 );\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm512_unpacklo_epi64( i0, i4 );\
o1 = _mm512_unpackhi_epi64( o1, i4 );\
o3 = i1;\
o4 = i2;\
o2 = _mm512_unpacklo_epi64( o2, i5 );\
o3 = _mm512_unpackhi_epi64( o3, i5 );\
o5 = i2;\
o6 = i3;\
o4 = _mm512_unpacklo_epi64( o4, i6 );\
o5 = _mm512_unpackhi_epi64( o5, i6 );\
o7 = i3;\
o6 = _mm512_unpacklo_epi64( o6, i7 );\
o7 = _mm512_unpackhi_epi64( o7, i7 );\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm512_unpacklo_epi64( i0, i1 );\
o0 = _mm512_unpackhi_epi64( o0, i1 );\
o1 = i2;\
i2 = _mm512_unpacklo_epi64( i2, i3 );\
o1 = _mm512_unpackhi_epi64( o1, i3 );\
o2 = i4;\
i4 = _mm512_unpacklo_epi64( i4, i5 );\
o2 = _mm512_unpackhi_epi64( o2, i5 );\
o3 = i6;\
i6 = _mm512_unpacklo_epi64( i6, i7 );\
o3 = _mm512_unpackhi_epi64( o3, i7 );\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm512_xor_si512( t0, t0 );\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm512_unpacklo_epi64( i0, t0 );\
i1 = _mm512_unpackhi_epi64( i1, t0 );\
i2 = _mm512_unpacklo_epi64( i2, t0 );\
i3 = _mm512_unpackhi_epi64( i3, t0 );\
i4 = _mm512_unpacklo_epi64( i4, t0 );\
i5 = _mm512_unpackhi_epi64( i5, t0 );\
i6 = _mm512_unpacklo_epi64( i6, t0 );\
i7 = _mm512_unpackhi_epi64( i7, t0 );\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm512_unpacklo_epi64( i0, i1 );\
i2 = _mm512_unpacklo_epi64( i2, i3 );\
i4 = _mm512_unpacklo_epi64( i4, i5 );\
i6 = _mm512_unpacklo_epi64( i6, i7 );\
}/**/
void INIT256_4way( __m512i* chaining )
{
static __m512i xmm0, xmm2, xmm6, xmm7;
static __m512i xmm12, xmm13, xmm14, xmm15;
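/* Editor's note: the static storage class on these locals is inherited
from the original reference code; every value is written before it is
read on each call, but dropping "static" would make these helpers
re-entrant. */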
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512_4way( __m512i* chaining, __m512i* message )
{
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m512i TEMP0;
static __m512i TEMP1;
static __m512i TEMP2;
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
xmm8 = chaining[0];
xmm0 = chaining[1];
xmm4 = chaining[2];
xmm5 = chaining[3];
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm512_xor_si512( xmm8, xmm12 );
xmm0 = _mm512_xor_si512( xmm0, xmm2 );
xmm4 = _mm512_xor_si512( xmm4, xmm6 );
xmm5 = _mm512_xor_si512( xmm5, xmm7 );
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm512_xor_si512( xmm0, xmm8 );
xmm1 = _mm512_xor_si512( xmm1, xmm10 );
xmm2 = _mm512_xor_si512( xmm2, xmm12 );
xmm3 = _mm512_xor_si512( xmm3, xmm14 );
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm512_xor_si512( xmm0, (chaining[0]) );
xmm1 = _mm512_xor_si512( xmm1, (chaining[1]) );
xmm2 = _mm512_xor_si512( xmm2, (chaining[2]) );
xmm3 = _mm512_xor_si512( xmm3, (chaining[3]) );
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
return;
}
void OF512_4way( __m512i* chaining )
{
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m512i TEMP0;
static __m512i TEMP1;
static __m512i TEMP2;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
xmm10 = _mm512_xor_si512( xmm10, (chaining[1]) );
xmm12 = _mm512_xor_si512( xmm12, (chaining[2]) );
xmm14 = _mm512_xor_si512( xmm14, (chaining[3]) );
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
}
#endif // VAES
#endif // GROESTL512_INTR_4WAY_H__

View File

@@ -0,0 +1,146 @@
/* hash.c Aug 2011
* groestl512-hash-4way https://github.com/JayDDee/cpuminer-opt 2019-12.
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
// Optimized for hash and data lengths that are integral multiples of __m128i.
#include <memory.h>
#include "groestl512-intr-4way.h"
#include "miner.h"
#include "simd-utils.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{
int i;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
memset_zero_512( ctx->chaining, SIZE512 );
memset_zero_512( ctx->buffer, SIZE512 );
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
return 0;
}
int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
const void* input, uint64_t databitlen )
{
const int len = (int)databitlen / 128;
const int hashlen_m128i = 64 / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE512;
__m512i* in = (__m512i*)input;
int i;
// --- update ---
for ( i = 0; i < blocks; i++ )
TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem;
//--- final ---
blocks++; // adjust for final block
if ( i == SIZE512 - 1 )
{
// only 1 vector left in buffer, all padding at once
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
}
else
{
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = m512_zero;
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
}
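/* Editor's note: this implements Groestl's standard padding -- a 0x80
byte directly after the message, zero fill, then the 64-bit big-endian
block count at the very end of the final 1024-bit block (only the low
16 bits of the count can be non-zero here, which covers the message
sizes this miner hashes). */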
TF1024_4way( ctx->chaining, ctx->buffer );
OF1024_4way( ctx->chaining );
for ( i = 0; i < hashlen_m128i; i++ )
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
const void* input, uint64_t datalen )
{
const int len = (int)datalen >> 4;
const int hashlen_m128i = 64 >> 4; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
uint64_t blocks = len / SIZE512;
__m512i* in = (__m512i*)input;
int i;
// --- init ---
SET_CONSTANTS();
memset_zero_512( ctx->chaining, SIZE512 );
memset_zero_512( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
for ( i = 0; i < blocks; i++ )
TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
// --- close ---
blocks++;
if ( i == SIZE512 - 1 )
{
// only 1 vector left in buffer, all padding at once
ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
}
else
{
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = m512_zero;
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
}
TF1024_4way( ctx->chaining, ctx->buffer );
OF1024_4way( ctx->chaining );
for ( i = 0; i < hashlen_m128i; i++ )
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
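/* Editor's sketch (not part of the diff): hashing four independent
64-byte messages with the one-shot call above. The interleave helper
names mirror the dintrlv_4x128_512 used elsewhere in this tree and are
assumptions; in0..in3/out0..out3 are hypothetical buffers. */
#if 0   /* illustration only */
groestl512_4way_context ctx __attribute__ ((aligned (128)));
uint64_t in0[8], in1[8], in2[8], in3[8];      /* four 64-byte messages */
uint64_t out0[8], out1[8], out2[8], out3[8];  /* four 64-byte digests */
uint64_t vin [8*4] __attribute__ ((aligned (64)));
uint64_t vout[8*4] __attribute__ ((aligned (64)));
intrlv_4x128_512( vin, in0, in1, in2, in3 );  /* assumed interleave helper */
groestl512_4way_full( &ctx, vout, vin, 64 );  /* datalen is in bytes */
dintrlv_4x128_512( out0, out1, out2, out3, vout );
#endif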
#endif // VAES

View File

@@ -0,0 +1,62 @@
#if !defined(GROESTL512_HASH_4WAY_H__)
#define GROESTL512_HASH_4WAY_H__ 1
#include "simd-utils.h"
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LENGTH (512)
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
//#define COLS512 (8)
#define COLS1024 (16)
//#define SIZE512 ((ROWS)*(COLS512))
#define SIZE_1024 ((ROWS)*(COLS1024))
//#define ROUNDS512 (10)
#define ROUNDS1024 (14)
//#if LENGTH<=256
//#define COLS (COLS512)
//#define SIZE (SIZE512)
//#define ROUNDS (ROUNDS512)
//#else
#define COLS (COLS1024)
//#define SIZE (SIZE1024)
#define ROUNDS (ROUNDS1024)
//#endif
#define SIZE512 (SIZE_1024/16)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];
__attribute__ ((aligned (64))) __m512i buffer[SIZE512];
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
int rem_ptr;
int databitlen; // bits
} groestl512_4way_context;
int groestl512_4way_init( groestl512_4way_context*, uint64_t );
//int reinit_groestl( hashState_groestl* );
int groestl512_4way_update( groestl512_4way_context*, const void*,
uint64_t );
int groestl512_4way_close( groestl512_4way_context*, void* );
int groestl512_4way_update_close( groestl512_4way_context*, void*,
const void*, uint64_t );
int groestl512_4way_full( groestl512_4way_context*, void*,
const void*, uint64_t );
#endif // VAES
#endif // GROESTL512_HASH_4WAY_H__

View File

@@ -0,0 +1,654 @@
/* groestl512-intr-4way.h
*
* Groestl implementation with intrinsics, 4-way parallel using AVX512 and
* VAES; derived from the Aug 2011 ssse3/sse4.1/aes version
* (groestl-intr-aes.h).
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#if !defined(GROESTL512_INTR_4WAY_H__)
#define GROESTL512_INTR_4WAY_H__ 1
#include "groestl512-hash-4way.h"
#if defined(__VAES__)
/* global constants */
__m512i ROUND_CONST_Lx;
//__m128i ROUND_CONST_L0[ROUNDS512];
//__m128i ROUND_CONST_L7[ROUNDS512];
__m512i ROUND_CONST_P[ROUNDS1024];
__m512i ROUND_CONST_Q[ROUNDS1024];
__m512i TRANSP_MASK;
__m512i SUBSH_MASK[8];
__m512i ALL_1B;
__m512i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm512_xor_si512(j, j);\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
i = _mm512_add_epi8(i, i);\
j = _mm512_and_si512(j, k);\
i = _mm512_xor_si512(i, j);\
}
/**/
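/* Editor's sketch (hypothetical helper, not in the diff): the scalar
operation MUL2 applies to each of the 64 state bytes -- doubling in
GF(2^8) modulo the AES polynomial x^8+x^4+x^3+x+1 (0x11b). The vector
code derives the 0x1b reduction mask from the sign bit of every byte at
once via _mm512_cmpgt_epi8_mask against zero. */
static inline uint8_t gf256_mul2( uint8_t x )
{
   /* if the top bit is set, the left shift overflows GF(2^8) and the
      reduction constant 0x1b must be xored back in */
   return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
}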
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm512_xor_si512(a0, a1);\
b0 = a2;\
a1 = _mm512_xor_si512(a1, a2);\
b1 = a3;\
a2 = _mm512_xor_si512(a2, a3);\
b2 = a4;\
a3 = _mm512_xor_si512(a3, a4);\
b3 = a5;\
a4 = _mm512_xor_si512(a4, a5);\
b4 = a6;\
a5 = _mm512_xor_si512(a5, a6);\
b5 = a7;\
a6 = _mm512_xor_si512(a6, a7);\
a7 = _mm512_xor_si512(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm512_xor_si512(b0, a4);\
b6 = _mm512_xor_si512(b6, a4);\
b1 = _mm512_xor_si512(b1, a5);\
b7 = _mm512_xor_si512(b7, a5);\
b2 = _mm512_xor_si512(b2, a6);\
b0 = _mm512_xor_si512(b0, a6);\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm512_xor_si512(b3, a7);\
b1 = _mm512_xor_si512(b1, a7);\
TEMP1 = b1;\
b4 = _mm512_xor_si512(b4, a0);\
b2 = _mm512_xor_si512(b2, a0);\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm512_xor_si512(b5, a1);\
b3 = _mm512_xor_si512(b3, a1);\
b1 = a1;\
b6 = _mm512_xor_si512(b6, a2);\
b4 = _mm512_xor_si512(b4, a2);\
TEMP2 = a2;\
b7 = _mm512_xor_si512(b7, a3);\
b5 = _mm512_xor_si512(b5, a3);\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm512_xor_si512(a0, a3);\
a1 = _mm512_xor_si512(a1, a4);\
a2 = _mm512_xor_si512(a2, a5);\
a3 = _mm512_xor_si512(a3, a6);\
a4 = _mm512_xor_si512(a4, a7);\
a5 = _mm512_xor_si512(a5, b0);\
a6 = _mm512_xor_si512(a6, b1);\
a7 = _mm512_xor_si512(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm512_xor_si512(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm512_xor_si512(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm512_xor_si512(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm512_xor_si512(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm512_xor_si512(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm512_xor_si512(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm512_xor_si512(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm512_xor_si512(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm512_xor_si512(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm512_xor_si512(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm512_xor_si512(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm512_xor_si512(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm512_xor_si512(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm512_xor_si512(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm512_xor_si512(b0, a3);\
b1 = _mm512_xor_si512(b1, a4);\
}/*MixBytes*/
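/* Editor's sketch (hypothetical reference, not in the diff): the same
recurrence evaluated byte-wise on one state column, using the
gf256_mul2() helper sketched above. Indices are mod 8 and '+' in the
relations is XOR; useful as a cross-check of the register schedule. */
static void mixbytes_column_ref( uint8_t b[8], const uint8_t a[8] )
{
   uint8_t t[8], x[8], y[8], w[8];
   int i;
   for ( i = 0; i < 8; i++ )                        /* t_i = a_i + a_{i+1} */
      t[i] = a[i] ^ a[ (i+1) & 7 ];
   for ( i = 0; i < 8; i++ )
   {
      x[i] = t[i] ^ t[ (i+3) & 7 ];                 /* x_i = t_i + t_{i+3} */
      y[i] = t[i] ^ t[ (i+2) & 7 ] ^ a[ (i+6) & 7 ];   /* y_i */
   }
   for ( i = 0; i < 8; i++ )                        /* w_i = 2*x_i + y_{i+4} */
      w[i] = gf256_mul2( x[i] ) ^ y[ (i+4) & 7 ];
   for ( i = 0; i < 8; i++ )                        /* b_i = 2*w_{i+3} + y_{i+4} */
      b[i] = gf256_mul2( w[ (i+3) & 7 ] ) ^ y[ (i+4) & 7 ];
}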
// calculate the round constants separately and load at startup
#define SET_CONSTANTS(){\
ALL_FF = _mm512_set1_epi32( 0xffffffff );\
ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
TRANSP_MASK = _mm512_set_epi32( \
0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
SUBSH_MASK[0] = _mm512_set_epi32( \
0x3336393c, 0x3f323538, 0x3b3e3134, 0x373a3d30, \
0x2326292c, 0x2f222528, 0x2b2e2124, 0x272a2d20, \
0x1316191c, 0x1f121518, 0x1b1e1114, 0x171a1d10, \
0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00 ); \
SUBSH_MASK[1] = _mm512_set_epi32( \
0x34373a3d, 0x30333639, 0x3c3f3235, 0x383b3e31, \
0x24272a2d, 0x20232629, 0x2c2f2225, 0x282b2e21, \
0x14171a1d, 0x10131619, 0x1c1f1215, 0x181b1e11, \
0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01 ); \
SUBSH_MASK[2] = _mm512_set_epi32( \
0x35383b3e, 0x3134373a, 0x3d303336, 0x393c3f32, \
0x25282b2e, 0x2124272a, 0x2d202326, 0x292c2f22, \
0x15181b1e, 0x1114171a, 0x1d101316, 0x191c1f12, \
0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02 ); \
SUBSH_MASK[3] = _mm512_set_epi32( \
0x36393c3f, 0x3235383b, 0x3e313437, 0x3a3d3033, \
0x26292c2f, 0x2225282b, 0x2e212427, 0x2a2d2023, \
0x16191c1f, 0x1215181b, 0x1e111417, 0x1a1d1013, \
0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003 ); \
SUBSH_MASK[4] = _mm512_set_epi32( \
0x373a3d30, 0x3336393c, 0x3f323538, 0x3b3e3134, \
0x272a2d20, 0x2326292c, 0x2f222528, 0x2b2e2124, \
0x171a1d10, 0x1316191c, 0x1f121518, 0x1b1e1114, \
0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104 ); \
SUBSH_MASK[5] = _mm512_set_epi32( \
0x383b3e31, 0x34373a3d, 0x30333639, 0x3c3f3235, \
0x282b2e21, 0x24272a2d, 0x20232629, 0x2c2f2225, \
0x181b1e11, 0x14171a1d, 0x10131619, 0x1c1f1215, \
0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205 ); \
SUBSH_MASK[6] = _mm512_set_epi32( \
0x393c3f32, 0x35383b3e, 0x3134373a, 0x3d303336, \
0x292c2f22, 0x25282b2e, 0x2124272a, 0x2d202326, \
0x191c1f12, 0x15181b1e, 0x1114171a, 0x1d101316, \
0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306 ); \
SUBSH_MASK[7] = _mm512_set_epi32( \
0x3e313437, 0x3a3d3033, 0x36393c3f, 0x3235383b, \
0x2e212427, 0x2a2d2023, 0x26292c2f, 0x2225282b, \
0x1e111417, 0x1a1d1013, 0x16191c1f, 0x1215181b, \
0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b ); \
for( i = 0; i < ROUNDS1024; i++ ) \
{ \
ROUND_CONST_P[i] = _mm512_set4_epi32( 0xf0e0d0c0 ^ (i * 0x01010101), \
0xb0a09080 ^ (i * 0x01010101), \
0x70605040 ^ (i * 0x01010101), \
0x30201000 ^ (i * 0x01010101) ); \
ROUND_CONST_Q[i] = _mm512_set4_epi32( 0x0f1f2f3f ^ (i * 0x01010101), \
0x4f5f6f7f ^ (i * 0x01010101), \
0x8f9fafbf ^ (i * 0x01010101), \
0xcfdfefff ^ (i * 0x01010101));\
} \
}while(0);\
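/* Editor's sketch (hypothetical helper): scalar form of the constants
replicated above. For round r, byte j of P's row-0 constant is (j<<4)^r;
Q uses the bitwise-complemented pattern, applied to row 7, with the other
rows xored with all-ones in ROUNDS_Q below. */
static void groestl1024_round_consts_ref( uint8_t p[16], uint8_t q[16],
                                          int r )
{
   for ( int j = 0; j < 16; j++ )
   {
      p[j] = (uint8_t) (  ( j << 4 ) ^ r );   /* 0x00,0x10,...,0xf0 ^ r */
      q[j] = (uint8_t) ( ~( j << 4 ) ^ r );   /* 0xff,0xef,...,0x0f ^ r */
   }
}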
/* one round
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = _mm512_xor_si512( b0, b0 );\
a0 = _mm512_aesenclast_epi128( a0, b0 );\
a1 = _mm512_aesenclast_epi128( a1, b0 );\
a2 = _mm512_aesenclast_epi128( a2, b0 );\
a3 = _mm512_aesenclast_epi128( a3, b0 );\
a4 = _mm512_aesenclast_epi128( a4, b0 );\
a5 = _mm512_aesenclast_epi128( a5, b0 );\
a6 = _mm512_aesenclast_epi128( a6, b0 );\
a7 = _mm512_aesenclast_epi128( a7, b0 );\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
#define ROUNDS_P(){\
uint8_t round_counter = 0;\
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm512_xor_si512( xmm8, ( ROUND_CONST_P[ round_counter ] ) );\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[0] ) );\
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[1] ) );\
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[2] ) );\
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[3] ) );\
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[4] ) );\
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[5] ) );\
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[6] ) );\
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[7] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant P1024 */\
xmm0 = _mm512_xor_si512( xmm0, ( ROUND_CONST_P[ round_counter+1 ] ) );\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[0] ) );\
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[1] ) );\
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[2] ) );\
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[3] ) );\
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[4] ) );\
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[5] ) );\
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[6] ) );\
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[7] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
#define ROUNDS_Q(){\
uint8_t round_counter = 0;\
for ( round_counter = 0; round_counter < 14; round_counter += 2) \
{ \
/* AddRoundConstant Q1024 */\
xmm1 = m512_neg1;\
xmm8 = _mm512_xor_si512( xmm8, xmm1 );\
xmm9 = _mm512_xor_si512( xmm9, xmm1 );\
xmm10 = _mm512_xor_si512( xmm10, xmm1 );\
xmm11 = _mm512_xor_si512( xmm11, xmm1 );\
xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
xmm15 = _mm512_xor_si512( xmm15, ( ROUND_CONST_Q[ round_counter ] ) );\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[1] ) );\
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[3] ) );\
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[5] ) );\
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[7] ) );\
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[0] ) );\
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[2] ) );\
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[4] ) );\
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[6] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant Q1024 */\
xmm9 = m512_neg1;\
xmm0 = _mm512_xor_si512( xmm0, xmm9 );\
xmm1 = _mm512_xor_si512( xmm1, xmm9 );\
xmm2 = _mm512_xor_si512( xmm2, xmm9 );\
xmm3 = _mm512_xor_si512( xmm3, xmm9 );\
xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
xmm7 = _mm512_xor_si512( xmm7, ( ROUND_CONST_Q[ round_counter+1 ] ) );\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[1] ) );\
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[3] ) );\
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[5] ) );\
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[7] ) );\
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[0] ) );\
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[2] ) );\
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[4] ) );\
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[6] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
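/* Editor's note: Q differs from P in three ways -- the state is first
complemented (rows 0-6 xor all-ones), the per-round constant enters
row 7 instead of row 0, and ShiftBytes uses the odd shift amounts, hence
the permuted SUBSH_MASK order 1,3,5,7,0,2,4,6. */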
/* Matrix Transpose
* input is a 1024-bit state with two columns in one xmm
* output is a 1024-bit state with two rows in one xmm
* inputs: i0-i7
* outputs: i0-i7
* clobbers: t0-t7
*/
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
t0 = TRANSP_MASK;\
\
i6 = _mm512_shuffle_epi8(i6, t0);\
i0 = _mm512_shuffle_epi8(i0, t0);\
i1 = _mm512_shuffle_epi8(i1, t0);\
i2 = _mm512_shuffle_epi8(i2, t0);\
i3 = _mm512_shuffle_epi8(i3, t0);\
t1 = i2;\
i4 = _mm512_shuffle_epi8(i4, t0);\
i5 = _mm512_shuffle_epi8(i5, t0);\
t2 = i4;\
t3 = i6;\
i7 = _mm512_shuffle_epi8(i7, t0);\
\
/* continue with unpack using 4 temp registers */\
t0 = i0;\
t2 = _mm512_unpackhi_epi16(t2, i5);\
i4 = _mm512_unpacklo_epi16(i4, i5);\
t3 = _mm512_unpackhi_epi16(t3, i7);\
i6 = _mm512_unpacklo_epi16(i6, i7);\
t0 = _mm512_unpackhi_epi16(t0, i1);\
t1 = _mm512_unpackhi_epi16(t1, i3);\
i2 = _mm512_unpacklo_epi16(i2, i3);\
i0 = _mm512_unpacklo_epi16(i0, i1);\
\
/* shuffle with immediate */\
t0 = _mm512_shuffle_epi32(t0, 216);\
t1 = _mm512_shuffle_epi32(t1, 216);\
t2 = _mm512_shuffle_epi32(t2, 216);\
t3 = _mm512_shuffle_epi32(t3, 216);\
i0 = _mm512_shuffle_epi32(i0, 216);\
i2 = _mm512_shuffle_epi32(i2, 216);\
i4 = _mm512_shuffle_epi32(i4, 216);\
i6 = _mm512_shuffle_epi32(i6, 216);\
\
/* continue with unpack */\
t4 = i0;\
i0 = _mm512_unpacklo_epi32(i0, i2);\
t4 = _mm512_unpackhi_epi32(t4, i2);\
t5 = t0;\
t0 = _mm512_unpacklo_epi32(t0, t1);\
t5 = _mm512_unpackhi_epi32(t5, t1);\
t6 = i4;\
i4 = _mm512_unpacklo_epi32(i4, i6);\
t7 = t2;\
t6 = _mm512_unpackhi_epi32(t6, i6);\
i2 = t0;\
t2 = _mm512_unpacklo_epi32(t2, t3);\
i3 = t0;\
t7 = _mm512_unpackhi_epi32(t7, t3);\
\
/* there are now 2 rows in each xmm */\
/* unpack to get 1 row of CV in each xmm */\
i1 = i0;\
i1 = _mm512_unpackhi_epi64(i1, i4);\
i0 = _mm512_unpacklo_epi64(i0, i4);\
i4 = t4;\
i3 = _mm512_unpackhi_epi64(i3, t2);\
i5 = t4;\
i2 = _mm512_unpacklo_epi64(i2, t2);\
i6 = t5;\
i5 = _mm512_unpackhi_epi64(i5, t6);\
i7 = t5;\
i4 = _mm512_unpacklo_epi64(i4, t6);\
i7 = _mm512_unpackhi_epi64(i7, t7);\
i6 = _mm512_unpacklo_epi64(i6, t7);\
/* transpose done */\
}/**/
/* Matrix Transpose Inverse
* input is a 1024-bit state with two rows in one xmm
* output is a 1024-bit state with two columns in one xmm
* inputs: i0-i7
* outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
* clobbers: t0-t4
*/
#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
/* transpose matrix to get output format */\
o1 = i0;\
i0 = _mm512_unpacklo_epi64(i0, i1);\
o1 = _mm512_unpackhi_epi64(o1, i1);\
t0 = i2;\
i2 = _mm512_unpacklo_epi64(i2, i3);\
t0 = _mm512_unpackhi_epi64(t0, i3);\
t1 = i4;\
i4 = _mm512_unpacklo_epi64(i4, i5);\
t1 = _mm512_unpackhi_epi64(t1, i5);\
t2 = i6;\
o0 = TRANSP_MASK;\
i6 = _mm512_unpacklo_epi64(i6, i7);\
t2 = _mm512_unpackhi_epi64(t2, i7);\
/* load transpose mask into a register, because it will be used 8 times */\
i0 = _mm512_shuffle_epi8(i0, o0);\
i2 = _mm512_shuffle_epi8(i2, o0);\
i4 = _mm512_shuffle_epi8(i4, o0);\
i6 = _mm512_shuffle_epi8(i6, o0);\
o1 = _mm512_shuffle_epi8(o1, o0);\
t0 = _mm512_shuffle_epi8(t0, o0);\
t1 = _mm512_shuffle_epi8(t1, o0);\
t2 = _mm512_shuffle_epi8(t2, o0);\
/* continue with unpack using 4 temp registers */\
t3 = i4;\
o2 = o1;\
o0 = i0;\
t4 = t1;\
\
t3 = _mm512_unpackhi_epi16(t3, i6);\
i4 = _mm512_unpacklo_epi16(i4, i6);\
o0 = _mm512_unpackhi_epi16(o0, i2);\
i0 = _mm512_unpacklo_epi16(i0, i2);\
o2 = _mm512_unpackhi_epi16(o2, t0);\
o1 = _mm512_unpacklo_epi16(o1, t0);\
t4 = _mm512_unpackhi_epi16(t4, t2);\
t1 = _mm512_unpacklo_epi16(t1, t2);\
/* shuffle with immediate */\
i4 = _mm512_shuffle_epi32(i4, 216);\
t3 = _mm512_shuffle_epi32(t3, 216);\
o1 = _mm512_shuffle_epi32(o1, 216);\
o2 = _mm512_shuffle_epi32(o2, 216);\
i0 = _mm512_shuffle_epi32(i0, 216);\
o0 = _mm512_shuffle_epi32(o0, 216);\
t1 = _mm512_shuffle_epi32(t1, 216);\
t4 = _mm512_shuffle_epi32(t4, 216);\
/* continue with unpack */\
i1 = i0;\
i3 = o0;\
i5 = o1;\
i7 = o2;\
i0 = _mm512_unpacklo_epi32(i0, i4);\
i1 = _mm512_unpackhi_epi32(i1, i4);\
o0 = _mm512_unpacklo_epi32(o0, t3);\
i3 = _mm512_unpackhi_epi32(i3, t3);\
o1 = _mm512_unpacklo_epi32(o1, t1);\
i5 = _mm512_unpackhi_epi32(i5, t1);\
o2 = _mm512_unpacklo_epi32(o2, t4);\
i7 = _mm512_unpackhi_epi32(i7, t4);\
/* transpose done */\
}/**/
void INIT_4way( __m512i* chaining )
{
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm8 - xmm15 */
xmm8 = chaining[0];
xmm9 = chaining[1];
xmm10 = chaining[2];
xmm11 = chaining[3];
xmm12 = chaining[4];
xmm13 = chaining[5];
xmm14 = chaining[6];
xmm15 = chaining[7];
/* transform chaining value from column ordering into row ordering */
Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
/* store transposed IV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
}
void TF1024_4way( __m512i* chaining, const __m512i* message )
{
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m512i QTEMP[8];
static __m512i TEMP0;
static __m512i TEMP1;
static __m512i TEMP2;
/* load message into registers xmm8 - xmm15 (Q = message) */
xmm8 = message[0];
xmm9 = message[1];
xmm10 = message[2];
xmm11 = message[3];
xmm12 = message[4];
xmm13 = message[5];
xmm14 = message[6];
xmm15 = message[7];
/* transform message M from column ordering into row ordering */
Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
/* store message M (Q input) for later */
QTEMP[0] = xmm8;
QTEMP[1] = xmm9;
QTEMP[2] = xmm10;
QTEMP[3] = xmm11;
QTEMP[4] = xmm12;
QTEMP[5] = xmm13;
QTEMP[6] = xmm14;
QTEMP[7] = xmm15;
/* xor CV to message to get P input */
/* result: CV+M in xmm8...xmm15 */
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
/* compute permutation P */
/* result: P(CV+M) in xmm8...xmm15 */
ROUNDS_P();
/* xor CV to P output (feed-forward) */
/* result: P(CV+M)+CV in xmm8...xmm15 */
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
/* store P(CV+M)+CV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
/* load message M (Q input) into xmm8-15 */
xmm8 = QTEMP[0];
xmm9 = QTEMP[1];
xmm10 = QTEMP[2];
xmm11 = QTEMP[3];
xmm12 = QTEMP[4];
xmm13 = QTEMP[5];
xmm14 = QTEMP[6];
xmm15 = QTEMP[7];
/* compute permutation Q */
/* result: Q(M) in xmm8...xmm15 */
ROUNDS_Q();
/* xor Q output */
/* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
/* store CV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
return;
}
void OF1024_4way( __m512i* chaining )
{
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m512i TEMP0;
static __m512i TEMP1;
static __m512i TEMP2;
/* load CV into registers xmm8 - xmm15 */
xmm8 = chaining[0];
xmm9 = chaining[1];
xmm10 = chaining[2];
xmm11 = chaining[3];
xmm12 = chaining[4];
xmm13 = chaining[5];
xmm14 = chaining[6];
xmm15 = chaining[7];
/* compute permutation P */
/* result: P(CV) in xmm8...xmm15 */
ROUNDS_P();
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8...xmm15 */
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
/* transpose CV back from row ordering to column ordering */
/* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
/* we only need to return the truncated half of the state */
chaining[4] = xmm0;
chaining[5] = xmm6;
chaining[6] = xmm13;
chaining[7] = xmm15;
return;
}
#endif // VAES
#endif // GROESTL512_INTR_4WAY_H__

View File

@@ -1,22 +1,20 @@
#include "myrgr-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#ifdef NO_AES_NI
#include "sph_groestl.h"
#else
#ifdef __AES__
#include "aes_ni/hash-groestl.h"
#else
#include "sph_groestl.h"
#endif
#include <openssl/sha.h>
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
#else
#ifdef __AES__
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
SHA256_CTX sha;
} myrgr_ctx_holder;
@@ -25,10 +23,10 @@ myrgr_ctx_holder myrgr_ctx;
void init_myrgr_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init( &myrgr_ctx.groestl );
#else
#ifdef __AES__
init_groestl ( &myrgr_ctx.groestl, 64 );
#else
sph_groestl512_init( &myrgr_ctx.groestl );
#endif
SHA256_Init( &myrgr_ctx.sha );
}
@@ -40,12 +38,12 @@ void myriad_hash(void *output, const void *input)
uint32_t _ALIGN(32) hash[16];
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
#else
#ifdef __AES__
update_groestl( &ctx.groestl, (char*)input, 640 );
final_groestl( &ctx.groestl, (char*)hash);
#else
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
#endif
SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 );

View File

@@ -1,14 +1,159 @@
#include "myrgr-gate.h"
#if defined(MYRGR_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha-hash-4way.h"
#if defined(__VAES__)
#include "groestl512-hash-4way.h"
#endif
#if defined(MYRGR_8WAY)
typedef struct {
#if defined(__VAES__)
groestl512_4way_context groestl;
#else
hashState_groestl groestl;
#endif
sha256_8way_context sha;
} myrgr_8way_ctx_holder;
myrgr_8way_ctx_holder myrgr_8way_ctx;
void init_myrgr_8way_ctx()
{
#if defined(__VAES__)
groestl512_4way_init( &myrgr_8way_ctx.groestl, 64 );
#else
init_groestl( &myrgr_8way_ctx.groestl, 64 );
#endif
sha256_8way_init( &myrgr_8way_ctx.sha );
}
void myriad_8way_hash( void *output, const void *input )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t vhashA[20*8] __attribute__ ((aligned (64)));
uint32_t vhashB[20*8] __attribute__ ((aligned (64)));
myrgr_8way_ctx_holder ctx;
memcpy( &ctx, &myrgr_8way_ctx, sizeof(myrgr_8way_ctx) );
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t hash4[20] __attribute__ ((aligned (64)));
uint32_t hash5[20] __attribute__ ((aligned (64)));
uint32_t hash6[20] __attribute__ ((aligned (64)));
uint32_t hash7[20] __attribute__ ((aligned (64)));
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
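/* Editor's note: groestl512_4way consumes 4x128-bit interleaved lanes
while sha256_8way wants 8x32, so the two 4-lane Groestl outputs are
de-interleaved to scalar hashes and re-interleaved 8-wide here. */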
#else
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t hash4[20] __attribute__ ((aligned (64)));
uint32_t hash5[20] __attribute__ ((aligned (64)));
uint32_t hash6[20] __attribute__ ((aligned (64)));
uint32_t hash7[20] __attribute__ ((aligned (64)));
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, input, 640 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 512 );
#endif
sha256_8way_update( &ctx.sha, vhash, 64 );
sha256_8way_close( &ctx.sha, output );
}
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_4x128( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+ 8, n+1 );
be32enc( noncep+16, n+2 );
be32enc( noncep+24, n+3 );
be32enc( noncep+32, n+4 );
be32enc( noncep+40, n+5 );
be32enc( noncep+48, n+6 );
be32enc( noncep+56, n+7 );
myriad_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane ] <= Htarg )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(MYRGR_4WAY)
typedef struct {
hashState_groestl groestl;
@@ -45,7 +190,7 @@ void myriad_4way_hash( void *output, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way( &ctx.sha, vhash, 64 );
sha256_4way_update( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, output );
}

View File

@@ -2,16 +2,22 @@
bool register_myriad_algo( algo_gate_t* gate )
{
#if defined (MYRGR_4WAY)
#if defined (MYRGR_8WAY)
init_myrgr_8way_ctx();
gate->scanhash = (void*)&scanhash_myriad_8way;
gate->hash = (void*)&myriad_8way_hash;
gate->optimizations = AES_OPT | AVX2_OPT | VAES_OPT;
#elif defined (MYRGR_4WAY)
init_myrgr_4way_ctx();
gate->scanhash = (void*)&scanhash_myriad_4way;
gate->hash = (void*)&myriad_4way_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | VAES_OPT;
#else
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
#endif
gate->optimizations = AES_OPT | AVX2_OPT;
return true;
};

View File

@@ -1,30 +1,35 @@
#ifndef MYRGR_GATE_H__
#define MYRGR_GATE_H__
#define MYRGR_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
#define MYRGR_4WAY
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define MYRGR_8WAY 1
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
#define MYRGR_4WAY 1
#endif
#if defined(MYRGR_4WAY)
#if defined(MYRGR_8WAY)
void myriad_8way_hash( void *state, const void *input );
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_myrgr_8way_ctx();
#elif defined(MYRGR_4WAY)
void myriad_4way_hash( void *state, const void *input );
int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_myrgr_4way_ctx();
#endif
#else
void myriad_hash( void *state, const void *input );
int scanhash_myriad( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_myrgr_ctx();
#endif
#endif

View File

@@ -1171,7 +1171,8 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
sc->h[7] = m256_const1_64( 0x6769756d2042656c );
}
void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;

View File

@@ -62,7 +62,7 @@ typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
size_t len );
#define hamsi512_4way hamsi512_4way_update
//#define hamsi512_4way hamsi512_4way_update
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

View File

@@ -38,7 +38,7 @@
#define SPH_XCAT_(a, b) a ## b
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
( haval_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;

View File

@@ -479,9 +479,9 @@ haval ## xxx ## _ ## y ## _4way_init(void *cc) \
} \
\
void \
haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \
haval ## xxx ## _ ## y ## _4way_update (void *cc, const void *data, size_t len) \
{ \
haval ## y ## _4way(cc, data, len); \
haval ## y ## _4way_update(cc, data, len); \
} \
\
void \

View File

@@ -85,7 +85,7 @@ typedef haval_4way_context haval256_5_4way_context;
void haval256_5_4way_init( void *cc );
void haval256_5_4way_update( void *cc, const void *data, size_t len );
#define haval256_5_4way haval256_5_4way_update
//#define haval256_5_4way haval256_5_4way_update
void haval256_5_4way_close( void *cc, void *dst );

View File

@@ -1,13 +1,10 @@
#include "algo-gate-api.h"
#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>
#include <stdint.h>
#include <stdlib.h>
#include "sph_hefty1.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/skein/sph_skein.h"
@@ -16,9 +13,7 @@
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/skein/sse2/skein.c"
#ifndef NO_AES_NI
#ifdef __AES__
#include "algo/echo/aes_ni/hash_api.h"
#endif
@@ -26,29 +21,23 @@ void bastionhash(void *output, const void *input)
{
unsigned char hash[64] __attribute__ ((aligned (64)));
#ifdef NO_AES_NI
sph_echo512_context ctx_echo;
#ifdef __AES__
hashState_echo ctx_echo;
#else
hashState_echo ctx_echo;
sph_echo512_context ctx_echo;
#endif
hashState_luffa ctx_luffa;
hashState_luffa ctx_luffa;
sph_fugue512_context ctx_fugue;
sph_whirlpool_context ctx_whirlpool;
sph_shabal512_context ctx_shabal;
sph_hamsi512_context ctx_hamsi;
unsigned char hashbuf[128] __attribute__ ((aligned (16)));
sph_u64 hashctA;
// sph_u64 hashctB;
size_t hashptr;
sph_hamsi512_context ctx_hamsi;
sph_skein512_context ctx_skein;
HEFTY1(input, 80, hash);
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// update_luffa( &ctx_luffa, hash, 64 );
// final_luffa( &ctx_luffa, hash );
if (hash[0] & 0x8)
{
@@ -56,10 +45,9 @@ void bastionhash(void *output, const void *input)
sph_fugue512(&ctx_fugue, hash, 64);
sph_fugue512_close(&ctx_fugue, hash);
} else {
DECL_SKN;
SKN_I;
SKN_U;
SKN_C;
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, hash, 64 );
sph_skein512_close( &ctx_skein, hash );
}
sph_whirlpool_init(&ctx_whirlpool);
@@ -72,33 +60,28 @@ void bastionhash(void *output, const void *input)
if (hash[0] & 0x8)
{
#ifdef NO_AES_NI
#ifdef __AES__
init_echo( &ctx_echo, 512 );
update_final_echo ( &ctx_echo,(BitSequence*)hash,
(const BitSequence*)hash, 512 );
#else
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, hash, 64);
sph_echo512_close(&ctx_echo, hash);
#else
init_echo( &ctx_echo, 512 );
update_final_echo ( &ctx_echo,(BitSequence*)hash,
(const BitSequence*)hash, 512 );
// update_echo ( &ctx_echo, hash, 512 );
// final_echo( &ctx_echo, hash );
#endif
} else {
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// update_luffa( &ctx_luffa, hash, 64 );
// final_luffa( &ctx_luffa, hash );
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
}
sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, hash, 64);
sph_shabal512_close(&ctx_shabal, hash);
DECL_SKN;
SKN_I;
SKN_U;
SKN_C;
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, hash, 64 );
sph_skein512_close( &ctx_skein, hash );
if (hash[0] & 0x8)
{
@@ -121,11 +104,9 @@ void bastionhash(void *output, const void *input)
sph_hamsi512(&ctx_hamsi, hash, 64);
sph_hamsi512_close(&ctx_hamsi, hash);
} else {
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// update_luffa( &ctx_luffa, hash, 64 );
// final_luffa( &ctx_luffa, hash );
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
}
memcpy(output, hash, 32);
@@ -152,10 +133,8 @@ int scanhash_bastion( struct work *work, uint32_t max_nonce,
be32enc(&endiandata[19], n);
bastionhash(hash32, endiandata);
if (hash32[7] < Htarg && fulltest(hash32, ptarget)) {
work_set_target_ratio(work, hash32);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
submit_solution( work, hash32, mythr );
}
n++;

View File

@@ -161,7 +161,7 @@ bool register_hodl_algo( algo_gate_t* gate )
// return false;
// }
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;

View File

@@ -41,57 +41,10 @@
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
#define SPH_SMALL_FOOTPRINT_JH 1
#endif
#if !defined SPH_JH_64 && SPH_64_TRUE
#define SPH_JH_64 1
#endif
#if !SPH_64
#undef SPH_JH_64
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/*
* The internal bitslice representation may use either big-endian or
* little-endian (true bitslice operations do not care about the bit
* ordering, and the bit-swapping linear operations in JH happen to
* be invariant through endianness-swapping). The constants must be
* defined according to the chosen endianness; we use some
* byte-swapping macros for that.
*/
#if SPH_LITTLE_ENDIAN
#if SPH_64
#define C64e(x) ((SPH_C64(x) >> 56) \
| ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
| ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
| ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \
| ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \
| ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
| ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
| ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
#define dec64e_aligned sph_dec64le_aligned
#define enc64e sph_enc64le
#endif
#else
#if SPH_64
#define C64e(x) SPH_C64(x)
#define dec64e_aligned sph_dec64be_aligned
#define enc64e sph_enc64be
#endif
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define Sb_8W(x0, x1, x2, x3, c) \
@@ -152,8 +105,97 @@ do { \
x3 = _mm256_xor_si256( x3, x4 ); \
} while (0)
#if SPH_JH_64
static const uint64_t C[] =
{
0x67f815dfa2ded572, 0x571523b70a15847b,
0xf6875a4d90d6ab81, 0x402bd1c3c54f9f4e,
0x9cfa455ce03a98ea, 0x9a99b26699d2c503,
0x8a53bbf2b4960266, 0x31a2db881a1456b5,
0xdb0e199a5c5aa303, 0x1044c1870ab23f40,
0x1d959e848019051c, 0xdccde75eadeb336f,
0x416bbf029213ba10, 0xd027bbf7156578dc,
0x5078aa3739812c0a, 0xd3910041d2bf1a3f,
0x907eccf60d5a2d42, 0xce97c0929c9f62dd,
0xac442bc70ba75c18, 0x23fcc663d665dfd1,
0x1ab8e09e036c6e97, 0xa8ec6c447e450521,
0xfa618e5dbb03f1ee, 0x97818394b29796fd,
0x2f3003db37858e4a, 0x956a9ffb2d8d672a,
0x6c69b8f88173fe8a, 0x14427fc04672c78a,
0xc45ec7bd8f15f4c5, 0x80bb118fa76f4475,
0xbc88e4aeb775de52, 0xf4a3a6981e00b882,
0x1563a3a9338ff48e, 0x89f9b7d524565faa,
0xfde05a7c20edf1b6, 0x362c42065ae9ca36,
0x3d98fe4e433529ce, 0xa74b9a7374f93a53,
0x86814e6f591ff5d0, 0x9f5ad8af81ad9d0e,
0x6a6234ee670605a7, 0x2717b96ebe280b8b,
0x3f1080c626077447, 0x7b487ec66f7ea0e0,
0xc0a4f84aa50a550d, 0x9ef18e979fe7e391,
0xd48d605081727686, 0x62b0e5f3415a9e7e,
0x7a205440ec1f9ffc, 0x84c9f4ce001ae4e3,
0xd895fa9df594d74f, 0xa554c324117e2e55,
0x286efebd2872df5b, 0xb2c4a50fe27ff578,
0x2ed349eeef7c8905, 0x7f5928eb85937e44,
0x4a3124b337695f70, 0x65e4d61df128865e,
0xe720b95104771bc7, 0x8a87d423e843fe74,
0xf2947692a3e8297d, 0xc1d9309b097acbdd,
0xe01bdc5bfb301b1d, 0xbf829cf24f4924da,
0xffbf70b431bae7a4, 0x48bcf8de0544320d,
0x39d3bb5332fcae3b, 0xa08b29e0c1c39f45,
0x0f09aef7fd05c9e5, 0x34f1904212347094,
0x95ed44e301b771a2, 0x4a982f4f368e3be9,
0x15f66ca0631d4088, 0xffaf52874b44c147,
0x30c60ae2f14abb7e, 0xe68c6eccc5b67046,
0x00ca4fbd56a4d5a4, 0xae183ec84b849dda,
0xadd1643045ce5773, 0x67255c1468cea6e8,
0x16e10ecbf28cdaa3, 0x9a99949a5806e933,
0x7b846fc220b2601f, 0x1885d1a07facced1,
0xd319dd8da15b5932, 0x46b4a5aac01c9a50,
0xba6b04e467633d9f, 0x7eee560bab19caf6,
0x742128a9ea79b11f, 0xee51363b35f7bde9,
0x76d350755aac571d, 0x01707da3fec2463a,
0x42d8a498afc135f7, 0x79676b9e20eced78,
0xa8db3aea15638341, 0x832c83324d3bc3fa,
0xf347271c1f3b40a7, 0x9a762db734f04059,
0xfd4f21d26c4e3ee7, 0xef5957dc398dfdb8,
0xdaeb492b490c9b8d, 0x0d70f36849d7a25b,
0x84558d7ad0ae3b7d, 0x658ef8e4f0e9a5f5,
0x533b1036f4a2b8a0, 0x5aec3e759e07a80c,
0x4f88e85692946891, 0x4cbcbaf8555cb05b,
0x7b9487f3993bbbe3, 0x5d1c6b72d6f4da75,
0x6db334dc28acae64, 0x71db28b850a5346c,
0x2a518d10f2e261f8, 0xfc75dd593364dbe3,
0xa23fce43f1bcac1c, 0xb043e8023cd1bb67,
0x75a12988ca5b0a33, 0x5c5316b44d19347f,
0x1e4d790ec3943b92, 0x3fafeeb6d7757479,
0x21391abef7d4a8ea, 0x5127234c097ef45c,
0xd23c32ba5324a326, 0xadd5a66d4a17a344,
0x08c9f2afa63e1db5, 0x563c6b91983d5983,
0x4d608672a17cf84c, 0xf6c76e08cc3ee246,
0x5e76bcb1b333982f, 0x2ae6c4efa566d62b,
0x36d4c1bee8b6f406, 0x6321efbc1582ee74,
0x69c953f40d4ec1fd, 0x26585806c45a7da7,
0x16fae0061614c17e, 0x3f9d63283daf907e,
0x0cd29b00e3f2c9d2, 0x300cd4b730ceaa5f,
0x9832e0f216512a74, 0x9af8cee3d830eb0d,
0x9279f1b57b9ec54b, 0xd36886046ee651ff,
0x316796e6574d239b, 0x05750a17f3a6e6cc,
0xce6c3213d98176b1, 0x62a205f88452173c,
0x47154778b3cb2bf4, 0x486a9323825446ff,
0x65655e4e0758df38, 0x8e5086fc897cfcf2,
0x86ca0bd0442e7031, 0x4e477830a20940f0,
0x8338f7d139eea065, 0xbd3a2ce437e95ef7,
0x6ff8130126b29721, 0xe7de9fefd1ed44a3,
0xd992257615dfa08b, 0xbe42dc12f6f7853c,
0x7eb027ab7ceca7d8, 0xdea83eaada7d8d53,
0xd86902bd93ce25aa, 0xf908731afd43f65a,
0xa5194a17daef5fc0, 0x6a21fd4c33664d97,
0x701541db3198b435, 0x9b54cdedbb0f1eea,
0x72409751a163d09a, 0xe26f4791bf9d75f6
};
// Big endian version
/*
static const sph_u64 C[] = {
C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
@@ -240,6 +282,7 @@ static const sph_u64 C[] = {
C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
};
*/
#define Ceven_hi(r) (C[((r) << 2) + 0])
#define Ceven_lo(r) (C[((r) << 2) + 1])
@@ -427,7 +470,7 @@ do { \
h7h = _mm256_xor_si256( h7h, m3h ); \
h7l = _mm256_xor_si256( h7l, m3l ); \
/*
static const sph_u64 IV256[] = {
C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
@@ -450,11 +493,8 @@ static const sph_u64 IV512[] = {
C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
};
*/
#else
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -484,57 +524,6 @@ static const sph_u64 IV512[] = {
W ## ro(h7); \
} while (0)
#if SPH_SMALL_FOOTPRINT_JH
#if SPH_JH_64
/*
* The "small footprint" 64-bit version just uses a partially unrolled
* loop.
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define E8_8W do { \
unsigned r; \
for (r = 0; r < 42; r += 7) { \
SL_8W(0); \
SL_8W(1); \
SL_8W(2); \
SL_8W(3); \
SL_8W(4); \
SL_8W(5); \
SL_8W(6); \
} \
} while (0)
#endif
#define E8 do { \
unsigned r; \
for (r = 0; r < 42; r += 7) { \
SL(0); \
SL(1); \
SL(2); \
SL(3); \
SL(4); \
SL(5); \
SL(6); \
} \
} while (0)
#else
#endif
#else
#if SPH_JH_64
/*
* On a "true 64-bit" architecture, we can unroll at will.
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -585,6 +574,7 @@ static const sph_u64 IV512[] = {
#endif // AVX512
#define E8 do { \
SLu( 0, 0); \
SLu( 1, 1); \
@@ -630,13 +620,6 @@ static const sph_u64 IV512[] = {
SLu(41, 6); \
} while (0)
#else
#endif
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
void jh256_8way_init( jh_8way_context *sc )
@@ -732,12 +715,12 @@ jh_8way_core( jh_8way_context *sc, const void *data, size_t len )
static void
jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
size_t out_size_w32, const void *iv )
size_t out_size_w32 )
{
__m512i buf[16*4];
__m512i *dst512 = (__m512i*)dst;
size_t numz, u;
sph_u64 l0, l1, l0e, l1e;
uint64_t l0, l1;
buf[0] = m512_const1_64( 0x80ULL );
@@ -748,12 +731,10 @@ jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
memset_zero_512( buf+1, (numz>>3) - 1 );
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
l1 = SPH_T64(sc->block_count >> 55);
sph_enc64be( &l0e, l0 );
sph_enc64be( &l1e, l1 );
*(buf + (numz>>3) ) = _mm512_set1_epi64( l1e );
*(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e );
l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 );
l1 = ( sc->block_count >> 55 );
*(buf + (numz>>3) ) = _mm512_set1_epi64( bswap_64( l1 ) );
*(buf + (numz>>3) + 1) = _mm512_set1_epi64( bswap_64( l0 ) );
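/* Editor's note: JH appends the message length as a 128-bit big-endian
bit count; l1:l0 is block_count*512 + ptr*8 split across two 64-bit
words, byte-swapped into the last two padding words above. */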
jh_8way_core( sc, buf, numz + 16 );
@@ -772,7 +753,7 @@ jh256_8way_update(void *cc, const void *data, size_t len)
void
jh256_8way_close(void *cc, void *dst)
{
jh_8way_close(cc, 0, 0, dst, 8, IV256);
jh_8way_close(cc, 0, 0, dst, 8);
}
void
@@ -784,7 +765,7 @@ jh512_8way_update(void *cc, const void *data, size_t len)
void
jh512_8way_close(void *cc, void *dst)
{
jh_8way_close(cc, 0, 0, dst, 16, IV512);
jh_8way_close(cc, 0, 0, dst, 16);
}
#endif
@@ -882,12 +863,12 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
static void
jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
size_t out_size_w32, const void *iv )
size_t out_size_w32 )
{
__m256i buf[16*4];
__m256i *dst256 = (__m256i*)dst;
size_t numz, u;
sph_u64 l0, l1, l0e, l1e;
uint64_t l0, l1;
buf[0] = m256_const1_64( 0x80ULL );
@@ -898,12 +879,10 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
memset_zero_256( buf+1, (numz>>3) - 1 );
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
l1 = SPH_T64(sc->block_count >> 55);
sph_enc64be( &l0e, l0 );
sph_enc64be( &l1e, l1 );
*(buf + (numz>>3) ) = _mm256_set1_epi64x( l1e );
*(buf + (numz>>3) + 1) = _mm256_set1_epi64x( l0e );
l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 );
l1 = ( sc->block_count >> 55 );
*(buf + (numz>>3) ) = _mm256_set1_epi64x( bswap_64( l1 ) );
*(buf + (numz>>3) + 1) = _mm256_set1_epi64x( bswap_64( l0 ) );
jh_4way_core( sc, buf, numz + 16 );
@@ -922,7 +901,7 @@ jh256_4way_update(void *cc, const void *data, size_t len)
void
jh256_4way_close(void *cc, void *dst)
{
jh_4way_close(cc, 0, 0, dst, 8, IV256);
jh_4way_close(cc, 0, 0, dst, 8 );
}
void
@@ -934,7 +913,7 @@ jh512_4way_update(void *cc, const void *data, size_t len)
void
jh512_4way_close(void *cc, void *dst)
{
jh_4way_close(cc, 0, 0, dst, 16, IV512);
jh_4way_close(cc, 0, 0, dst, 16 );
}
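
The close refactor above replaces the sph_enc64be round trip with plain uint64_t arithmetic and a direct byte swap. A minimal scalar sketch of that length encoding, assuming the project's bswap_64 maps to the GCC builtin (the helper name here is illustrative):

#include <stdint.h>
#include <stddef.h>

/* Sketch of the bit-length encoding in jh_*way_close above: block_count
   counts processed 512-bit blocks, ptr counts buffered bytes, and JH
   appends the 128-bit total big-endian (high word first). */
static inline void jh_encode_bitlen( uint64_t block_count, size_t ptr,
                                     uint64_t *hi_be, uint64_t *lo_be )
{
    uint64_t l0 = ( block_count << 9 ) + ( (uint64_t)ptr << 3 );
    uint64_t l1 =   block_count >> 55;
    *hi_be = __builtin_bswap64( l1 );   /* stands in for bswap_64() */
    *lo_be = __builtin_bswap64( l0 );
}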

View File

@@ -43,7 +43,6 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_jh256 256
@@ -103,14 +102,12 @@ typedef jh_4way_context jh512_4way_context;
void jh256_4way_init( jh_4way_context *sc);
void jh256_4way_update(void *cc, const void *data, size_t len);
#define jh256_4way jh256_4way_update
void jh256_4way_close(void *cc, void *dst);
void jh512_4way_init( jh_4way_context *sc );
void jh512_4way_update(void *cc, const void *data, size_t len);
#define jh512_4way jh512_4way_update
void jh512_4way_close(void *cc, void *dst);
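
A minimal caller of the 4-way interface declared above, as a sketch; it assumes the input is already interleaved 4x64 (four messages side by side), the convention used throughout this code base, and the wrapper name is illustrative:

/* One-shot 4-way JH-512: vdata is 4x64-interleaved input (len bytes per
   lane), vhash receives four interleaved 512-bit digests. */
static void jh512_4way_of( void *vhash, const void *vdata, size_t len )
{
    jh512_4way_context ctx;
    jh512_4way_init( &ctx );
    jh512_4way_update( &ctx, vdata, len );
    jh512_4way_close( &ctx, vhash );
}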

View File

@@ -33,7 +33,7 @@ void jha_hash_4way( void *out, const void *input )
keccak512_4way_context ctx_keccak;
keccak512_4way_init( &ctx_keccak );
keccak512_4way( &ctx_keccak, input, 80 );
keccak512_4way_update( &ctx_keccak, input, 80 );
keccak512_4way_close( &ctx_keccak, vhash );
// Heavy & Light Pair Loop
@@ -58,18 +58,18 @@ void jha_hash_4way( void *out, const void *input )
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_update( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhashB );
for ( int i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, vhash, 64 );
blake512_4way_update( &ctx_blake, vhash, 64 );
blake512_4way_close( &ctx_blake, vhashA );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_update( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhashB );
for ( int i = 0; i < 8; i++ )
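
The blend above uses _mm256_blendv_epi8( vhA, vhB, vh_mask ): lanes whose mask is set keep the vhB digest, the others keep vhA, with the mask derived from bit 0 of each lane's current hash. A scalar sketch of that per-lane decision (names illustrative; compare the if/else chain in the scalar jha_hash below):

#include <stdint.h>

/* What the vector blend decides for one 512-bit lane. */
static void jha_select_lane( uint64_t out[8], const uint64_t hA[8],
                             const uint64_t hB[8], int mask_bit )
{
    for ( int i = 0; i < 8; i++ )
        out[i] = mask_bit ? hB[i] : hA[i];
}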

View File

@@ -1,19 +1,16 @@
#include "jha-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"
#else
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
static __thread sph_keccak512_context jha_kec_mid __attribute__ ((aligned (64)));
@@ -28,12 +25,12 @@ void jha_hash(void *output, const void *input)
{
uint8_t _ALIGN(128) hash[64];
#ifdef NO_AES_NI
sph_groestl512_context ctx_groestl;
#ifdef __AES__
hashState_groestl ctx_groestl;
#else
hashState_groestl ctx_groestl;
sph_groestl512_context ctx_groestl;
#endif
sph_blake512_context ctx_blake;
sph_blake512_context ctx_blake;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
@@ -46,36 +43,36 @@ void jha_hash(void *output, const void *input)
for (int round = 0; round < 3; round++)
{
if (hash[0] & 0x01)
{
#ifdef NO_AES_NI
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, hash, 64 );
sph_groestl512_close(&ctx_groestl, hash );
{
#ifdef __AES__
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(char*)hash, 512 );
#else
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
(char*)hash, 512 );
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, hash, 64 );
sph_groestl512_close(&ctx_groestl, hash );
#endif
}
else
{
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, hash, 64);
sph_skein512_close(&ctx_skein, hash );
}
}
else
{
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, hash, 64);
sph_skein512_close(&ctx_skein, hash );
}
if (hash[0] & 0x01)
{
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, hash );
}
else
{
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, hash, 64 );
sph_jh512_close(&ctx_jh, hash );
}
if (hash[0] & 0x01)
{
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, hash );
}
else
{
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, hash, 64 );
sph_jh512_close(&ctx_jh, hash );
}
}
memcpy(output, hash, 32);
@@ -117,9 +114,6 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
jha_kec_midstate( endiandata );
#ifdef DEBUG_ALGO
printf("[%d] Htarg=%X\n", thr_id, Htarg);
#endif
for (int m=0; m < 6; m++) {
if (Htarg <= htmax[m]) {
uint32_t mask = masks[m];
@@ -127,25 +121,9 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
pdata[19] = ++n;
be32enc(&endiandata[19], n);
jha_hash(hash32, endiandata);
#ifndef DEBUG_ALGO
if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) {
work_set_target_ratio(work, hash32);
*hashes_done = n - first_nonce + 1;
return 1;
}
#else
if (!(n % 0x1000) && !thr_id) printf(".");
if (!(hash32[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash32, ptarget)) {
work_set_target_ratio(work, hash32);
*hashes_done = n - first_nonce + 1;
return 1;
}
}
#endif
if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget))
submit_solution( work, hash32, mythr );
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
break;
}
}
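
As the retained comment notes, the htmax => mask pairing follows blake.c: pick the smallest difficulty class that covers Htarg, then reject almost every nonce with a single AND on hash32[7] before paying for fulltest. A sketch of the tables, with values typical for this family of miners (the actual arrays are declared above the excerpt, so treat these as illustrative):

/* htmax[m] is the largest Htarg class m handles; masks[m] holds the bits
   of hash32[7] that must all be zero before a full 256-bit compare:
   if ( !( hash32[7] & mask ) && fulltest( hash32, ptarget ) ) submit. */
static const uint32_t htmax[] = { 0x0,        0xF,        0xFF,
                                  0xFFF,      0xFFFF,     0x10000000 };
static const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                                  0xFFFFF000, 0xFFFF0000, 0x0 };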

File diff suppressed because it is too large

View File

@@ -1,465 +0,0 @@
/* This program gives the optimized SSE2 bitslice implementation of JH for 32-bit platform (with 8 128-bit XMM registers).
-----------------------------------------
Performance:
Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz)
Operating System: 32-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic)
Speed for long message:
1) 23.6 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O2
2) 24.1 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -msse2 -O3
------------------------------------------
Compared with the original JH sse2 code for the 32-bit platform, the following modifications were made:
a) The Sbox implementation follows exactly the description given in the document
b) Data alignment definition is improved so that the code can be compiled by GCC, Intel C++ compiler and Microsoft Visual C compiler
c) Using y0,y1,..,y7 variables in Function F8 for performance improvement (local variable in function F8 so that compiler can optimize the code easily)
d) Removed a number of intermediate variables from the program (so as to give the compiler more freedom to optimize the code)
e) Using "for" loop to implement 42 rounds (with 7 rounds in each loop), so as to reduce the code size.
------------------------------------------
Last Modified: January 16, 2011
*/
#include <emmintrin.h>
#include <string.h>
typedef unsigned int uint32;
typedef __m128i word128; /*word128 defines a 128-bit SSE2 word*/
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn;
/*define data alignment for different C compilers*/
#if defined(__GNUC__)
#define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
#else
#define DATA_ALIGN16(x) __declspec(align(16)) x
#endif
typedef struct {
int hashbitlen; /*the message digest size*/
unsigned long long databitlen; /*the message size in bits*/
unsigned long long datasize_in_buffer; /*the size of the message remaining in the buffer; assumed to be a multiple of 8 bits except for the last partial block at the end of the message*/
word128 x0,x1,x2,x3,x4,x5,x6,x7; /*1024-bit state;*/
unsigned char buffer[64]; /*512-bit message block;*/
} hashState;
/*The initial hash value H(0)*/
DATA_ALIGN16(const unsigned char JH224_H0[128])={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e};
DATA_ALIGN16(const unsigned char JH256_H0[128])={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69};
DATA_ALIGN16(const unsigned char JH384_H0[128])={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f};
DATA_ALIGN16(const unsigned char JH512_H0[128])={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b};
/*42 round constants, each round constant is 32-byte (256-bit)*/
DATA_ALIGN16(const unsigned char E8_bitslice_roundconstant[42][32])={
{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40},
{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31},
{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc},
{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3},
{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23},
{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97},
{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14},
{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4},
{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36},
{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f},
{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b},
{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62},
{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5},
{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f},
{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a},
{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf},
{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0},
{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a},
{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6},
{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67},
{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18},
{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e},
{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1},
{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83},
{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef},
{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65},
{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c},
{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71},
{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0},
{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f},
{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad},
{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6},
{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63},
{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f},
{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a},
{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5},
{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48},
{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e},
{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7},
{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde},
{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a},
{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}};
void F8(hashState *state); /* the compression function F8 */
/*The API functions*/
HashReturn Init(hashState *state, int hashbitlen);
HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
HashReturn Final(hashState *state, BitSequence *hashval);
HashReturn Hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval);
/*The following defines operations on 128-bit word(s)*/
#define CONSTANT(b) _mm_set1_epi8((b)) /*set each byte in a 128-bit register to be "b"*/
#define XOR(x,y) _mm_xor_si128((x),(y)) /*XOR(x,y) = x ^ y, where x and y are two 128-bit words*/
#define AND(x,y) _mm_and_si128((x),(y)) /*AND(x,y) = x & y, where x and y are two 128-bit words*/
#define ANDNOT(x,y) _mm_andnot_si128((x),(y)) /*ANDNOT(x,y) = (!x) & y, where x and y are two 128-bit words*/
#define OR(x,y) _mm_or_si128((x),(y)) /*OR(x,y) = x | y, where x and y are two 128-bit words*/
#define SHR1(x) _mm_srli_epi16((x), 1) /*SHR1(x) = x >> 1, where x is a 128 bit word*/
#define SHR2(x) _mm_srli_epi16((x), 2) /*SHR2(x) = x >> 2, where x is a 128 bit word*/
#define SHR4(x) _mm_srli_epi16((x), 4) /*SHR4(x) = x >> 4, where x is a 128 bit word*/
#define SHR8(x) _mm_slli_epi16((x), 8) /*SHR8(x) = x >> 8, where x is a 128 bit word*/
#define SHR16(x) _mm_slli_epi32((x), 16) /*SHR16(x) = x >> 16, where x is a 128 bit word*/
#define SHR32(x) _mm_slli_epi64((x), 32) /*SHR32(x) = x >> 32, where x is a 128 bit word*/
#define SHR64(x) _mm_slli_si128((x), 8) /*SHR64(x) = x >> 64, where x is a 128 bit word*/
#define SHL1(x) _mm_slli_epi16((x), 1) /*SHL1(x) = x << 1, where x is a 128 bit word*/
#define SHL2(x) _mm_slli_epi16((x), 2) /*SHL2(x) = x << 2, where x is a 128 bit word*/
#define SHL4(x) _mm_slli_epi16((x), 4) /*SHL4(x) = x << 4, where x is a 128 bit word*/
#define SHL8(x) _mm_srli_epi16((x), 8) /*SHL8(x) = x << 8, where x is a 128 bit word*/
#define SHL16(x) _mm_srli_epi32((x), 16) /*SHL16(x) = x << 16, where x is a 128 bit word*/
#define SHL32(x) _mm_srli_epi64((x), 32) /*SHL32(x) = x << 32, where x is a 128 bit word*/
#define SHL64(x) _mm_srli_si128((x), 8) /*SHL64(x) = x << 64, where x is a 128 bit word*/
#define SWAP1(x) OR(SHR1(AND((x),CONSTANT(0xaa))),SHL1(AND((x),CONSTANT(0x55)))) /*swapping bit 2i with bit 2i+1 of the 128-bit x */
#define SWAP2(x) OR(SHR2(AND((x),CONSTANT(0xcc))),SHL2(AND((x),CONSTANT(0x33)))) /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 of the 128-bit x */
#define SWAP4(x) OR(SHR4(AND((x),CONSTANT(0xf0))),SHL4(AND((x),CONSTANT(0xf)))) /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of the 128-bit x */
#define SWAP8(x) OR(SHR8(x),SHL8(x)) /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 of the 128-bit x */
#define SWAP16(x) OR(SHR16(x),SHL16(x)) /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 of the 128-bit x */
#define SWAP32(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(2,3,0,1)) /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 of the 128-bit x*/
#define SWAP64(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(1,0,3,2)) /*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 of the 128-bit x*/
#define STORE(x,p) _mm_store_si128((__m128i *)(p), (x)) /*store the 128-bit word x into memory address p, where p is a multiple of 16 bytes*/
#define LOAD(p) _mm_load_si128((__m128i *)(p)) /*load 16 bytes from the memory address p, return a 128-bit word, where p is a multiple of 16 bytes*/
/*The MDS code*/
#define L(m0,m1,m2,m3,m4,m5,m6,m7) \
(m4) = XOR((m4),(m1)); \
(m5) = XOR((m5),(m2)); \
(m6) = XOR(XOR((m6),(m3)),(m0)); \
(m7) = XOR((m7),(m0)); \
(m0) = XOR((m0),(m5)); \
(m1) = XOR((m1),(m6)); \
(m2) = XOR(XOR((m2),(m7)),(m4)); \
(m3) = XOR((m3),(m4));
/*The Sbox, it implements S0 and S1, selected by a constant bit*/
#define S(m0,m1,m2,m3,c0) \
m3 = XOR(m3,CONSTANT(0xff)); \
m0 = XOR(m0,ANDNOT(m2,c0)); \
temp0 = XOR(c0,AND(m0,m1)); \
m0 = XOR(m0,AND(m3,m2)); \
m3 = XOR(m3,ANDNOT(m1,m2)); \
m1 = XOR(m1,AND(m0,m2)); \
m2 = XOR(m2,ANDNOT(m3,m0)); \
m0 = XOR(m0,OR(m1,m3)); \
m3 = XOR(m3,AND(m1,m2)); \
m2 = XOR(m2,temp0); \
m1 = XOR(m1,AND(temp0,m0));
/* The linear transform of the (7i+0)th round*/
#define lineartransform_R00(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
L(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bit 2i with bit 2i+1 for m4,m5,m6 and m7 */ \
m4 = SWAP1(m4); m5 = SWAP1(m5); m6 = SWAP1(m6); m7 = SWAP1(m7);
/* The linear transform of the (7i+1)th round*/
#define lineartransform_R01(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
L(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bit 4i||4i+1 with bit 4i+2||4i+3 for m4,m5,m6 and m7 */ \
m4 = SWAP2(m4); m5 = SWAP2(m5); m6 = SWAP2(m6); m7 = SWAP2(m7);
/* The linear transform of the (7i+2)th round*/
#define lineartransform_R02(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
L(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 for m4,m5,m6 and m7*/ \
m4 = SWAP4(m4); m5 = SWAP4(m5); m6 = SWAP4(m6); m7 = SWAP4(m7);
/* The linear transform of the (7i+3)th round*/
#define lineartransform_R03(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
L(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 for m4,m5,m6 and m7*/ \
m4 = SWAP8(m4); m5 = SWAP8(m5); m6 = SWAP8(m6); m7 = SWAP8(m7);
/* The linear transform of the (7i+4)th round*/
#define lineartransform_R04(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
L(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 for m0,m1,m2 and m3*/ \
m4 = SWAP16(m4); m5 = SWAP16(m5); m6 = SWAP16(m6); m7 = SWAP16(m7);
/* The linear transform of the (7i+5)th round -- faster*/
#define lineartransform_R05(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
L(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 for m0,m1,m2 and m3*/ \
m4 = SWAP32(m4); m5 = SWAP32(m5); m6 = SWAP32(m6); m7 = SWAP32(m7);
/* The linear transform of the (7i+6)th round -- faster*/
#define lineartransform_R06(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
L(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 for m0,m1,m2 and m3*/ \
m4 = SWAP64(m4); m5 = SWAP64(m5); m6 = SWAP64(m6); m7 = SWAP64(m7);
/*the round function of E8 */
#define round_function(nn,r) \
S(y0,y2,y4,y6, LOAD(E8_bitslice_roundconstant[r]) ); \
S(y1,y3,y5,y7, LOAD(E8_bitslice_roundconstant[r]+16) ); \
lineartransform_R##nn(y0,y2,y4,y6,y1,y3,y5,y7);
/*the compression function F8 */
void F8(hashState *state)
{
uint32 i;
word128 y0,y1,y2,y3,y4,y5,y6,y7;
word128 temp0;
y0 = state->x0;
y1 = state->x1;
y2 = state->x2;
y3 = state->x3;
y4 = state->x4;
y5 = state->x5;
y6 = state->x6;
y7 = state->x7;
/*xor the 512-bit message with the first half of the 1024-bit hash state*/
y0 = XOR(y0, LOAD(state->buffer));
y1 = XOR(y1, LOAD(state->buffer+16));
y2 = XOR(y2, LOAD(state->buffer+32));
y3 = XOR(y3, LOAD(state->buffer+48));
/*perform 42 rounds*/
for (i = 0; i < 42; i = i+7) {
round_function(00,i);
round_function(01,i+1);
round_function(02,i+2);
round_function(03,i+3);
round_function(04,i+4);
round_function(05,i+5);
round_function(06,i+6);
}
/*xor the 512-bit message with the second half of the 1024-bit hash state*/
y4 = XOR(y4, LOAD(state->buffer));
y5 = XOR(y5, LOAD(state->buffer+16));
y6 = XOR(y6, LOAD(state->buffer+32));
y7 = XOR(y7, LOAD(state->buffer+48));
state->x0 = y0;
state->x1 = y1;
state->x2 = y2;
state->x3 = y3;
state->x4 = y4;
state->x5 = y5;
state->x6 = y6;
state->x7 = y7;
}
/*before hashing a message, initialize the hash state as H0 */
HashReturn Init(hashState *state, int hashbitlen)
{
state->databitlen = 0;
state->datasize_in_buffer = 0;
state->hashbitlen = hashbitlen;
/*initialize the initial hash value of JH*/
/*load the initial hash value into state*/
switch(hashbitlen)
{
case 224:
state->x0 = LOAD(JH224_H0);
state->x1 = LOAD(JH224_H0+16);
state->x2 = LOAD(JH224_H0+32);
state->x3 = LOAD(JH224_H0+48);
state->x4 = LOAD(JH224_H0+64);
state->x5 = LOAD(JH224_H0+80);
state->x6 = LOAD(JH224_H0+96);
state->x7 = LOAD(JH224_H0+112);
break;
case 256:
state->x0 = LOAD(JH256_H0);
state->x1 = LOAD(JH256_H0+16);
state->x2 = LOAD(JH256_H0+32);
state->x3 = LOAD(JH256_H0+48);
state->x4 = LOAD(JH256_H0+64);
state->x5 = LOAD(JH256_H0+80);
state->x6 = LOAD(JH256_H0+96);
state->x7 = LOAD(JH256_H0+112);
break;
case 384:
state->x0 = LOAD(JH384_H0);
state->x1 = LOAD(JH384_H0+16);
state->x2 = LOAD(JH384_H0+32);
state->x3 = LOAD(JH384_H0+48);
state->x4 = LOAD(JH384_H0+64);
state->x5 = LOAD(JH384_H0+80);
state->x6 = LOAD(JH384_H0+96);
state->x7 = LOAD(JH384_H0+112);
break;
case 512:
state->x0 = LOAD(JH512_H0);
state->x1 = LOAD(JH512_H0+16);
state->x2 = LOAD(JH512_H0+32);
state->x3 = LOAD(JH512_H0+48);
state->x4 = LOAD(JH512_H0+64);
state->x5 = LOAD(JH512_H0+80);
state->x6 = LOAD(JH512_H0+96);
state->x7 = LOAD(JH512_H0+112);
break;
}
return(SUCCESS);
}
/*hash each 512-bit message block, except the last partial block*/
HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
{
DataLength index; /*the starting address of the data to be compressed*/
state->databitlen += databitlen;
index = 0;
/*if there is remaining data in the buffer, fill it to a full message block first*/
/*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/
/*There is data in the buffer, but the incoming data is insufficient for a full block*/
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) {
if ( (databitlen & 7) == 0 ) {
memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)) ;
}
else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1) ;
state->datasize_in_buffer += databitlen;
databitlen = 0;
}
/*There is data in the buffer, and the incoming data is sufficient for a full block*/
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
index = 64-(state->datasize_in_buffer >> 3);
databitlen = databitlen - (512 - state->datasize_in_buffer);
F8(state);
state->datasize_in_buffer = 0;
}
/*hash the remaining full message blocks*/
for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) {
memcpy(state->buffer, data+index, 64);
F8(state);
}
/*store the partial block into the buffer, assuming that if part of the last byte is not part of the message, that part consists of 0 bits*/
if ( databitlen > 0) {
if ((databitlen & 7) == 0)
memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
else
memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1);
state->datasize_in_buffer = databitlen;
}
return(SUCCESS);
}
/*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/
HashReturn Final(hashState *state, BitSequence *hashval)
{
unsigned int i;
DATA_ALIGN16(unsigned char t[64]);
if ( (state->databitlen & 0x1ff) == 0 )
{
/*pad the message when databitlen is a multiple of 512 bits, then process the padded block*/
memset(state->buffer,0,64);
state->buffer[0] = 0x80;
state->buffer[63] = state->databitlen & 0xff;
state->buffer[62] = (state->databitlen >> 8) & 0xff;
state->buffer[61] = (state->databitlen >> 16) & 0xff;
state->buffer[60] = (state->databitlen >> 24) & 0xff;
state->buffer[59] = (state->databitlen >> 32) & 0xff;
state->buffer[58] = (state->databitlen >> 40) & 0xff;
state->buffer[57] = (state->databitlen >> 48) & 0xff;
state->buffer[56] = (state->databitlen >> 56) & 0xff;
F8(state);
}
else {
/*set the rest of the bytes in the buffer to 0*/
if ( (state->datasize_in_buffer & 7) == 0)
for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
else
for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) state->buffer[i] = 0;
/*pad and process the partial block when databitlen is not a multiple of 512 bits, then hash the padded blocks*/
state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7));
F8(state);
memset(state->buffer,0,64);
state->buffer[63] = state->databitlen & 0xff;
state->buffer[62] = (state->databitlen >> 8) & 0xff;
state->buffer[61] = (state->databitlen >> 16) & 0xff;
state->buffer[60] = (state->databitlen >> 24) & 0xff;
state->buffer[59] = (state->databitlen >> 32) & 0xff;
state->buffer[58] = (state->databitlen >> 40) & 0xff;
state->buffer[57] = (state->databitlen >> 48) & 0xff;
state->buffer[56] = (state->databitlen >> 56) & 0xff;
F8(state);
}
/*truncating the final hash value to generate the message digest*/
STORE(state->x4,t);
STORE(state->x5,t+16);
STORE(state->x6,t+32);
STORE(state->x7,t+48);
switch (state->hashbitlen)
{
case 224: memcpy(hashval,t+36,28); break;
case 256: memcpy(hashval,t+32,32); break;
case 384: memcpy(hashval,t+16,48); break;
case 512: memcpy(hashval,t,64); break;
}
return(SUCCESS);
}
/* hash a message,
three inputs: message digest size in bits (hashbitlen); message (data); message length in bits (databitlen)
one output: message digest (hashval)
*/
HashReturn Hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval)
{
hashState state;
if ( hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512 )
{
Init(&state, hashbitlen);
Update(&state, data, databitlen);
Final(&state, hashval);
return SUCCESS;
}
else
return(BAD_HASHLEN);
}
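
The S macro above is pure bitslice logic: each bit position evaluates S0 or S1, selected by the matching round-constant bit, using only XOR/AND/ANDNOT/OR. Since no shifts are involved it transliterates to any word size; a 64-bit scalar rendering as a sketch (function name illustrative):

#include <stdint.h>

/* Direct 64-bit transliteration of the S macro above; c0 is one word of
   the round constant, choosing S0 (bit clear) or S1 (bit set) per bit. */
static inline void jh_sbox64( uint64_t *m0, uint64_t *m1,
                              uint64_t *m2, uint64_t *m3, uint64_t c0 )
{
    uint64_t t;
    *m3 = ~*m3;                 /* XOR with CONSTANT(0xff) */
    *m0 ^= ~*m2 & c0;           /* ANDNOT(m2, c0)          */
    t    = c0 ^ ( *m0 & *m1 );
    *m0 ^= *m3 & *m2;
    *m3 ^= ~*m1 & *m2;          /* ANDNOT(m1, m2)          */
    *m1 ^= *m0 & *m2;
    *m2 ^= ~*m3 & *m0;          /* ANDNOT(m3, m0)          */
    *m0 ^= *m1 | *m3;
    *m3 ^= *m1 & *m2;
    *m2 ^= t;
    *m1 ^= t & *m0;
}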

View File

@@ -1,357 +0,0 @@
/*This program gives the optimized SSE2 bitslice implementation of JH for 64-bit platform (with 16 128-bit XMM registers).
--------------------------------
Performance
Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz)
Operating System: 64-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic)
Speed for long message:
1) 19.9 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O3
2) 20.9 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -msse2 -O3
--------------------------------
Compared with the original JH sse2 code (October 2008) for the 64-bit platform, we made the following modifications:
a) The Sbox implementation follows exactly the description given in the document
b) Data alignment definition is improved so that the code can be compiled by GCC, Intel C++ compiler and Microsoft Visual C compiler
c) Using y0,y1,..,y7 variables in Function F8 for performance improvement (local variable in function F8 so that compiler can optimize the code easily)
d) Removed a number of intermediate variables from the program (so as to give the compiler more freedom to optimize the code)
e) Using "for" loop to implement 42 rounds (with 7 rounds in each loop), so as to reduce the code size.
--------------------------------
Last Modified: January 16, 2011
*/
#include <emmintrin.h>
#include <stdint.h>
#include <string.h>
#include "algo/sha/sha3-defs.h"
typedef __m128i word128; /*word128 defines a 128-bit SSE2 word*/
typedef enum {jhSUCCESS = 0, jhFAIL = 1, jhBAD_HASHLEN = 2} jhReturn;
/*define data alignment for different C compilers*/
#if defined(__GNUC__)
#define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
#else
#define DATA_ALIGN16(x) __declspec(align(16)) x
#endif
typedef struct {
DataLength jhbitlen; /*the message digest size*/
DataLength databitlen; /*the message size in bits*/
DataLength datasize_in_buffer; /*the size of the message remaining in the buffer; assumed to be a multiple of 8 bits except for the last partial block at the end of the message*/
word128 x0,x1,x2,x3,x4,x5,x6,x7; /*1024-bit state;*/
unsigned char buffer[64]; /*512-bit message block;*/
} jhState;
#define DECL_JH \
word128 jhSx0,jhSx1,jhSx2,jhSx3,jhSx4,jhSx5,jhSx6,jhSx7; \
unsigned char jhSbuffer[64];
/*The initial hash value H(0)*/
static DATA_ALIGN16(const unsigned char JH512_H0[128])={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b};
/*42 round constants, each round constant is 32-byte (256-bit)*/
static DATA_ALIGN16(const unsigned char jhE8_bitslice_roundconstant[42][32])={
{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40},
{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31},
{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc},
{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3},
{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23},
{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97},
{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14},
{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4},
{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36},
{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f},
{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b},
{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62},
{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5},
{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f},
{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a},
{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf},
{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0},
{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a},
{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6},
{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67},
{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18},
{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e},
{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1},
{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83},
{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef},
{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65},
{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c},
{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71},
{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0},
{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f},
{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad},
{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6},
{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63},
{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f},
{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a},
{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5},
{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48},
{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e},
{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7},
{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde},
{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a},
{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}};
//static void jhF8(jhState *state); /* the compression function F8 */
/*The API functions*/
/*The following defines operations on 128-bit word(s)*/
#define jhCONSTANT(b) _mm_set1_epi8((b)) /*set each byte in a 128-bit register to be "b"*/
#define jhXOR(x,y) _mm_xor_si128((x),(y)) /*jhXOR(x,y) = x ^ y, where x and y are two 128-bit words*/
#define jhAND(x,y) _mm_and_si128((x),(y)) /*jhAND(x,y) = x & y, where x and y are two 128-bit words*/
#define jhANDNOT(x,y) _mm_andnot_si128((x),(y)) /*jhANDNOT(x,y) = (!x) & y, where x and y are two 128-bit words*/
#define jhOR(x,y) _mm_or_si128((x),(y)) /*jhOR(x,y) = x | y, where x and y are two 128-bit words*/
#define jhSHR1(x) _mm_srli_epi16((x), 1) /*jhSHR1(x) = x >> 1, where x is a 128 bit word*/
#define jhSHR2(x) _mm_srli_epi16((x), 2) /*jhSHR2(x) = x >> 2, where x is a 128 bit word*/
#define jhSHR4(x) _mm_srli_epi16((x), 4) /*jhSHR4(x) = x >> 4, where x is a 128 bit word*/
#define jhSHR8(x) _mm_slli_epi16((x), 8) /*jhSHR8(x) = x >> 8, where x is a 128 bit word*/
#define jhSHR16(x) _mm_slli_epi32((x), 16) /*jhSHR16(x) = x >> 16, where x is a 128 bit word*/
#define jhSHR32(x) _mm_slli_epi64((x), 32) /*jhSHR32(x) = x >> 32, where x is a 128 bit word*/
#define jhSHR64(x) _mm_slli_si128((x), 8) /*jhSHR64(x) = x >> 64, where x is a 128 bit word*/
#define jhSHL1(x) _mm_slli_epi16((x), 1) /*jhSHL1(x) = x << 1, where x is a 128 bit word*/
#define jhSHL2(x) _mm_slli_epi16((x), 2) /*jhSHL2(x) = x << 2, where x is a 128 bit word*/
#define jhSHL4(x) _mm_slli_epi16((x), 4) /*jhSHL4(x) = x << 4, where x is a 128 bit word*/
#define jhSHL8(x) _mm_srli_epi16((x), 8) /*jhSHL8(x) = x << 8, where x is a 128 bit word*/
#define jhSHL16(x) _mm_srli_epi32((x), 16) /*jhSHL16(x) = x << 16, where x is a 128 bit word*/
#define jhSHL32(x) _mm_srli_epi64((x), 32) /*jhSHL32(x) = x << 32, where x is a 128 bit word*/
#define jhSHL64(x) _mm_srli_si128((x), 8) /*jhSHL64(x) = x << 64, where x is a 128 bit word*/
#define jhSWAP1(x) jhOR(jhSHR1(jhAND((x),jhCONSTANT(0xaa))),jhSHL1(jhAND((x),jhCONSTANT(0x55)))) /*swapping bit 2i with bit 2i+1 of the 128-bit x */
#define jhSWAP2(x) jhOR(jhSHR2(jhAND((x),jhCONSTANT(0xcc))),jhSHL2(jhAND((x),jhCONSTANT(0x33)))) /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 of the 128-bit x */
#define jhSWAP4(x) jhOR(jhSHR4(jhAND((x),jhCONSTANT(0xf0))),jhSHL4(jhAND((x),jhCONSTANT(0xf)))) /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of the 128-bit x */
#define jhSWAP8(x) jhOR(jhSHR8(x),jhSHL8(x)) /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 of the 128-bit x */
#define jhSWAP16(x) jhOR(jhSHR16(x),jhSHL16(x)) /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 of the 128-bit x */
#define jhSWAP32(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(2,3,0,1)) /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 of the 128-bit x*/
#define jhSWAP64(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(1,0,3,2)) /*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 of the 128-bit x*/
#define jhSTORE(x,p) _mm_store_si128((__m128i *)(p), (x)) /*store the 128-bit word x into memory address p, where p is a multiple of 16 bytes*/
#define jhLOAD(p) _mm_load_si128((__m128i *)(p)) /*load 16 bytes from the memory address p, return a 128-bit word, where p is a multiple of 16 bytes*/
/*The MDS code*/
#define jhL(m0,m1,m2,m3,m4,m5,m6,m7) \
(m4) = jhXOR((m4),(m1)); \
(m5) = jhXOR((m5),(m2)); \
(m6) = jhXOR(jhXOR((m6),(m3)),(m0)); \
(m7) = jhXOR((m7),(m0)); \
(m0) = jhXOR((m0),(m5)); \
(m1) = jhXOR((m1),(m6)); \
(m2) = jhXOR(jhXOR((m2),(m7)),(m4)); \
(m3) = jhXOR((m3),(m4));
/*Two Sboxes computed in parallel, each Sbox implements S0 and S1, selected by a constant bit*/
/*The reason to compute two Sboxes in parallel is to try to fully utilize the parallel processing power of SSE2 instructions*/
#define jhSS(m0,m1,m2,m3,m4,m5,m6,m7,constant0,constant1) \
m3 = jhXOR(m3,jhCONSTANT(0xff)); \
m7 = jhXOR(m7,jhCONSTANT(0xff)); \
m0 = jhXOR(m0,jhANDNOT(m2,constant0)); \
m4 = jhXOR(m4,jhANDNOT(m6,constant1)); \
a0 = jhXOR(constant0,jhAND(m0,m1)); \
a1 = jhXOR(constant1,jhAND(m4,m5)); \
m0 = jhXOR(m0,jhAND(m3,m2)); \
m4 = jhXOR(m4,jhAND(m7,m6)); \
m3 = jhXOR(m3,jhANDNOT(m1,m2)); \
m7 = jhXOR(m7,jhANDNOT(m5,m6)); \
m1 = jhXOR(m1,jhAND(m0,m2)); \
m5 = jhXOR(m5,jhAND(m4,m6)); \
m2 = jhXOR(m2,jhANDNOT(m3,m0)); \
m6 = jhXOR(m6,jhANDNOT(m7,m4)); \
m0 = jhXOR(m0,jhOR(m1,m3)); \
m4 = jhXOR(m4,jhOR(m5,m7)); \
m3 = jhXOR(m3,jhAND(m1,m2)); \
m7 = jhXOR(m7,jhAND(m5,m6)); \
m2 = jhXOR(m2,a0); \
m6 = jhXOR(m6,a1); \
m1 = jhXOR(m1,jhAND(a0,m0)); \
m5 = jhXOR(m5,jhAND(a1,m4));
/* The linear transform of the (7*i+0)th round*/
#define jhlineartransform_R00(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
jhL(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bit 2i with bit 2i+1 for m4,m5,m6 and m7 */ \
m4 = jhSWAP1(m4); m5 = jhSWAP1(m5); m6 = jhSWAP1(m6); m7 = jhSWAP1(m7);
/* The linear transform of the (7*i+1)th round*/
#define jhlineartransform_R01(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
jhL(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bit 4i||4i+1 with bit 4i+2||4i+3 for m4,m5,m6 and m7 */ \
m4 = jhSWAP2(m4); m5 = jhSWAP2(m5); m6 = jhSWAP2(m6); m7 = jhSWAP2(m7);
/* The linear transform of the (7*i+2)th round*/
#define jhlineartransform_R02(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
jhL(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 for m4,m5,m6 and m7*/ \
m4 = jhSWAP4(m4); m5 = jhSWAP4(m5); m6 = jhSWAP4(m6); m7 = jhSWAP4(m7);
/* The linear transform of the (7*i+3)th round*/
#define jhlineartransform_R03(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
jhL(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 for m4,m5,m6 and m7*/ \
m4 = jhSWAP8(m4); m5 = jhSWAP8(m5); m6 = jhSWAP8(m6); m7 = jhSWAP8(m7);
/* The linear transform of the (7*i+4)th round*/
#define jhlineartransform_R04(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
jhL(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 for m0,m1,m2 and m3*/ \
m4 = jhSWAP16(m4); m5 = jhSWAP16(m5); m6 = jhSWAP16(m6); m7 = jhSWAP16(m7);
/* The linear transform of the (7*i+5)th round -- faster*/
#define jhlineartransform_R05(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
jhL(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 for m0,m1,m2 and m3*/ \
m4 = jhSWAP32(m4); m5 = jhSWAP32(m5); m6 = jhSWAP32(m6); m7 = jhSWAP32(m7);
/* The linear transform of the (7*i+6)th round -- faster*/
#define jhlineartransform_R06(m0,m1,m2,m3,m4,m5,m6,m7) \
/*MDS layer*/ \
jhL(m0,m1,m2,m3,m4,m5,m6,m7); \
/*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 for m0,m1,m2 and m3*/ \
m4 = jhSWAP64(m4); m5 = jhSWAP64(m5); m6 = jhSWAP64(m6); m7 = jhSWAP64(m7);
/*the round function of E8 */
#define jhround_function(nn,r) \
jhSS(y0,y2,y4,y6,y1,y3,y5,y7, jhLOAD(jhE8_bitslice_roundconstant[r]), jhLOAD(jhE8_bitslice_roundconstant[r]+16) ); \
jhlineartransform_R##nn(y0,y2,y4,y6,y1,y3,y5,y7);
/*the round function of E8 */
#define jhround_functionI(nn,r) \
jhSS(jhSx0,jhSx2,jhSx4,jhSx6,jhSx1,jhSx3,jhSx5,jhSx7, jhLOAD(jhE8_bitslice_roundconstant[r]), jhLOAD(jhE8_bitslice_roundconstant[r]+16) ); \
jhlineartransform_R##nn(jhSx0,jhSx2,jhSx4,jhSx6,jhSx1,jhSx3,jhSx5,jhSx7);
/*
//the compression function F8
static void jhF8(jhState *state)
{
return;
uint64_t i;
word128 y0,y1,y2,y3,y4,y5,y6,y7;
word128 a0,a1;
y0 = state->x0,
y0 = jhXOR(y0, jhLOAD(state->buffer));
y1 = state->x1,
y1 = jhXOR(y1, jhLOAD(state->buffer+16));
y2 = state->x2,
y2 = jhXOR(y2, jhLOAD(state->buffer+32));
y3 = state->x3,
y3 = jhXOR(y3, jhLOAD(state->buffer+48));
y4 = state->x4;
y5 = state->x5;
y6 = state->x6;
y7 = state->x7;
//xor the 512-bit message with the first half of the 1024-bit hash state
//perform 42 rounds
for (i = 0; i < 42; i = i+7) {
jhround_function(00,i);
jhround_function(01,i+1);
jhround_function(02,i+2);
jhround_function(03,i+3);
jhround_function(04,i+4);
jhround_function(05,i+5);
jhround_function(06,i+6);
}
//xor the 512-bit message with the second half of the 1024-bit hash state
state->x0 = y0;
state->x1 = y1;
state->x2 = y2;
state->x3 = y3;
y4 = jhXOR(y4, jhLOAD(state->buffer)),
state->x4 = y4;
y5 = jhXOR(y5, jhLOAD(state->buffer+16)),
state->x5 = y5;
y6 = jhXOR(y6, jhLOAD(state->buffer+32)),
state->x6 = y6;
y7 = jhXOR(y7, jhLOAD(state->buffer+48)),
state->x7 = y7;
}
*/
#define jhF8I \
do { \
uint64_t i; \
word128 a0,a1; \
jhSx0 = jhXOR(jhSx0, jhLOAD(jhSbuffer)); \
jhSx1 = jhXOR(jhSx1, jhLOAD(jhSbuffer+16)); \
jhSx2 = jhXOR(jhSx2, jhLOAD(jhSbuffer+32)); \
jhSx3 = jhXOR(jhSx3, jhLOAD(jhSbuffer+48)); \
for (i = 0; i < 42; i = i+7) { \
jhround_functionI(00,i); \
jhround_functionI(01,i+1); \
jhround_functionI(02,i+2); \
jhround_functionI(03,i+3); \
jhround_functionI(04,i+4); \
jhround_functionI(05,i+5); \
jhround_functionI(06,i+6); \
} \
jhSx4 = jhXOR(jhSx4, jhLOAD(jhSbuffer)); \
jhSx5 = jhXOR(jhSx5, jhLOAD(jhSbuffer+16)); \
jhSx6 = jhXOR(jhSx6, jhLOAD(jhSbuffer+32)); \
jhSx7 = jhXOR(jhSx7, jhLOAD(jhSbuffer+48)); \
} while (0)
/* the whole thing:
* load the message from hash,
* then hash = JH512(that message)
*/
#define JH_H \
do { \
jhSx0 = jhLOAD(JH512_H0); \
jhSx1 = jhLOAD(JH512_H0+16); \
jhSx2 = jhLOAD(JH512_H0+32); \
jhSx3 = jhLOAD(JH512_H0+48); \
jhSx4 = jhLOAD(JH512_H0+64); \
jhSx5 = jhLOAD(JH512_H0+80); \
jhSx6 = jhLOAD(JH512_H0+96); \
jhSx7 = jhLOAD(JH512_H0+112); \
/* run the inlined jhF8I twice: once on the message, */ \
/* once on the padding block; b flags the second pass */ \
int b = false; \
memcpy(jhSbuffer, hash, 64); \
for(;;) { \
jhF8I; \
if (b) break; \
memset(jhSbuffer,0,48); \
jhSbuffer[0] = 0x80; \
jhSbuffer[48] = 0x00, \
jhSbuffer[49] = 0x00, \
jhSbuffer[50] = 0x00, \
jhSbuffer[51] = 0x00, \
jhSbuffer[52] = 0x00, \
jhSbuffer[53] = 0x00, \
jhSbuffer[54] = 0x00, \
jhSbuffer[55] = 0x00; \
jhSbuffer[56] = ((char)((uint64_t)(64*8) >> 56)) & 0xff, \
jhSbuffer[57] = ((char)((uint64_t)(64*8) >> 48)) & 0xff, \
jhSbuffer[58] = ((char)((uint64_t)(64*8) >> 40)) & 0xff, \
jhSbuffer[59] = ((char)((uint64_t)(64*8) >> 32)) & 0xff, \
jhSbuffer[60] = ((char)((uint64_t)(64*8) >> 24)) & 0xff, \
jhSbuffer[61] = ((char)((uint64_t)(64*8) >> 16)) & 0xff, \
jhSbuffer[62] = ((char)((uint64_t)(64*8) >> 8)) & 0xff, \
jhSbuffer[63] = (64*8) & 0xff; \
b = true; \
} \
jhSTORE(jhSx4,(char *)(hash)); \
jhSTORE(jhSx5,(char *)(hash)+16); \
jhSTORE(jhSx6,(char *)(hash)+32); \
jhSTORE(jhSx7,(char *)(hash)+48); \
} while (0)
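
Because this inlined variant only ever hashes exactly 64 bytes, the second jhF8I pass in JH_H always consumes the same block: 0x80, zeros, then the message length in bits (64*8 = 512 = 0x200) big-endian in the last eight bytes. The equivalent constant, spelled out as a sketch (array name illustrative):

/* The constant padding block built inside JH_H above; C99 designated
   initializers, unnamed elements default to zero. 0x200 big-endian puts
   0x02 in byte 62 and 0x00 in byte 63. */
static const unsigned char jh_pad_64byte_msg[64] = {
    [0]  = 0x80,
    [62] = 0x02
};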

View File

@@ -1,127 +0,0 @@
/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
/**
* JH interface. JH is a family of functions which differ by
* their output size; this implementation defines JH for output
* sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_jh.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_JH_H__
#define SPH_JH_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "sph_types.h"
#define QSTATIC static
/**
* Output size (in bits) for JH-512.
*/
#define SPH_SIZE_jh512 512
/**
* This structure is a context for JH computations: it contains the
* intermediate values and some data from the last entered block. Once
* a JH computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running JH computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
size_t ptr;
union {
sph_u64 wide[16];
sph_u32 narrow[32];
} H;
sph_u64 block_count;
} sph_jh_context;
/**
* Type for a JH-512 context (identical to the common context).
*/
typedef sph_jh_context sph_jh512_context;
/**
* Initialize a JH-512 context. This process performs no memory allocation.
*
* @param cc the JH-512 context (pointer to a
* <code>sph_jh512_context</code>)
*/
QSTATIC void sph_jh512_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the JH-512 context
* @param data the input data
* @param len the input data length (in bytes)
*/
QSTATIC void sph_jh512(void *cc, const void *data, size_t len);
/**
* Terminate the current JH-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the JH-512 context
* @param dst the destination buffer
*/
QSTATIC void sph_jh512_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 downto 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the JH-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
QSTATIC void sph_jh512_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#ifdef __cplusplus
}
#endif
#endif
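
A minimal one-shot wrapper over the streaming interface documented above, as a sketch; since the declarations are QSTATIC (static), it assumes a translation unit where those definitions are visible:

#include <stddef.h>

/* One-shot JH-512 over an arbitrary message. */
static void jh512_digest( unsigned char digest[64],
                          const void *msg, size_t len )
{
    sph_jh512_context ctx;
    sph_jh512_init( &ctx );
    sph_jh512( &ctx, msg, len );       /* len may be zero */
    sph_jh512_close( &ctx, digest );   /* also reinitializes ctx */
}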

View File

@@ -28,26 +28,28 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
keccakhash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
@@ -79,27 +81,28 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
keccakhash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
if ( valid_hash( lane_hash, ptarget ))
{
pdata[19] = n + lane;
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
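The 8-way and 4-way scanners above share one pattern worth spelling out: a cheap pre-filter compares only the hash's top 32-bit word against Htarg, and only surviving lanes are deinterleaved and fully validated. Stripped of widths, the per-iteration logic is roughly (names as in the code above; N_LANES stands for 8 or 4):

   for ( int lane = 0; lane < N_LANES; lane++ )
      if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )  /* cheap 32-bit pre-filter */
      {
         extr_lane_8x64( lane_hash, hash, lane, 256 );    /* deinterleave one lane */
         if ( valid_hash( lane_hash, ptarget ) )          /* full 256-bit test */
         {
            pdata[19] = bswap_32( n + lane );
            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }

The bswap_32 on the reported nonce reflects the other change in this hunk: the nonce counter is now blended into the already byte-swapped interleaved data without its own per-iteration byte swap, so the value actually hashed is the byte-reversed counter.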

View File

@@ -1,5 +1,9 @@
#include "keccak-gate.h"
#include "sph_keccak.h"
int hard_coded_eb = 1;
// KECCAK
bool register_keccak_algo( algo_gate_t* gate )
{
@@ -19,6 +23,8 @@ bool register_keccak_algo( algo_gate_t* gate )
return true;
};
// KECCAKC
bool register_keccakc_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
@@ -37,3 +43,50 @@ bool register_keccakc_algo( algo_gate_t* gate )
return true;
};
// SHA3D
void sha3d( void *state, const void *input, int len )
{
uint32_t _ALIGN(64) buffer[16], hash[16];
sph_keccak_context ctx_keccak;
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, input, len );
sph_keccak256_close( &ctx_keccak, (void*) buffer );
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, buffer, 32 );
sph_keccak256_close( &ctx_keccak, (void*) hash );
memcpy(state, hash, 32);
}
void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
{
sha3d( merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size );
for ( int i = 0; i < sctx->job.merkle_count; i++ )
{
memcpy( merkle_root + 32, sctx->job.merkle[i], 32 );
sha256d( merkle_root, merkle_root, 64 );
}
}
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;
#endif
return true;
};
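hard_coded_eb is the sponge padding byte wired through the keccak cores. Original Keccak uses pad10*1 alone, whose first pad byte is 0x01; NIST SHA-3 prepends the 01 domain bits, making it 0x06. The default of 1 above keeps keccak/keccakc unchanged, while register_sha3d_algo's hard_coded_eb = 6 turns the same core into SHA-3-256 for sha3d. A minimal sketch of the byte-level rule (per the Keccak/SHA-3 specs; buf, ptr and rate_bytes are illustrative names):

   buf[ptr] = (unsigned char) hard_coded_eb;    /* 0x01 = Keccak, 0x06 = SHA-3 */
   memset( buf + ptr + 1, 0, rate_bytes - ptr - 1 );
   buf[rate_bytes - 1] |= 0x80;                 /* closing pad10*1 bit */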

View File

@@ -10,24 +10,37 @@
#define KECCAK_4WAY 1
#endif
extern int hard_coded_eb;
#if defined(KECCAK_8WAY)
void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -1,6 +1,7 @@
#include <stddef.h>
#include <stdint.h>
#include "keccak-hash-4way.h"
#include "keccak-gate.h"
static const uint64_t RC[] = {
0x0000000000000001, 0x0000000000008082,
@@ -163,12 +164,12 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
unsigned eb;
union {
__m512i tmp[lim + 1];
sph_u64 dummy; /* for alignment */
uint64_t dummy; /* for alignment */
} u;
size_t j;
size_t m512_len = byte_len >> 3;
eb = 0x100 >> 8;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
@@ -344,12 +345,12 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
unsigned eb;
union {
__m256i tmp[lim + 1];
sph_u64 dummy; /* for alignment */
uint64_t dummy; /* for alignment */
} u;
size_t j;
size_t m256_len = byte_len >> 3;
eb = 0x100 >> 8;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;

View File

@@ -43,16 +43,8 @@ extern "C"{
#ifdef __AVX2__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_keccak256 256
/**
* Output size (in bits) for Keccak-512.
*/
#define SPH_SIZE_keccak512 512
/**
* This structure is a context for Keccak computations: it contains the
* intermediate values and some data from the last entered block. Once a
@@ -99,14 +91,12 @@ typedef keccak64_ctx_m256i keccak512_4way_context;
void keccak256_4way_init(void *cc);
void keccak256_4way_update(void *cc, const void *data, size_t len);
void keccak256_4way_close(void *cc, void *dst);
#define keccak256_4way keccak256_4way_update
void keccak512_4way_init(void *cc);
void keccak512_4way_update(void *cc, const void *data, size_t len);
void keccak512_4way_close(void *cc, void *dst);
void keccak512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define keccak512_4way keccak512_4way_update
#endif
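The 4-way API above mirrors the scalar sph pattern but operates on four interleaved lanes, with lengths given per lane. A hedged usage sketch (interleaving the four inputs via the repo's intrlv helpers is assumed):

   uint32_t vin[20*4] __attribute__ ((aligned (32)));  /* 4 x 80-byte inputs, 4x64 interleaved */
   uint32_t vout[8*4] __attribute__ ((aligned (32)));  /* 4 interleaved 256-bit digests */
   keccak256_4way_context ctx;
   keccak256_4way_init( &ctx );
   keccak256_4way_update( &ctx, vin, 80 );   /* length is per lane, in bytes */
   keccak256_4way_close( &ctx, vout );

The trailing #define aliases keep the old keccak256_4way / keccak512_4way names as synonyms for the renamed _update entry points.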

View File

@@ -18,36 +18,34 @@ void keccakhash(void *state, const void *input)
memcpy(state, hash, 32);
}
int scanhash_keccak( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
//const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[32];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
const int thr_id = mythr->id;
uint32_t _ALIGN(32) hash64[8];
uint32_t endiandata[32];
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
do {
be32enc( &endiandata[19], n );
keccakhash( hash64, endiandata );
if ( valid_hash( hash64, ptarget ) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while ( n < last_nonce && !work_restart[thr_id].restart );
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
keccakhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}

algo/keccak/sha3d-4way.c   Normal file   (126 lines)
View File

@@ -0,0 +1,126 @@
#include "keccak-gate.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
void sha3d_hash_8way(void *state, const void *input)
{
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
keccak256_8way_context ctx;
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, input, 80 );
keccak256_8way_close( &ctx, buffer );
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, buffer, 32 );
keccak256_8way_close( &ctx, state );
}
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*8] __attribute__ ((aligned (128)));
uint32_t hash[16*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do {
sha3d_hash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(KECCAK_4WAY)
void sha3d_hash_4way(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, input, 80 );
keccak256_4way_close( &ctx, buffer );
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, buffer, 32 );
keccak256_4way_close( &ctx, state );
}
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t hash[16*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do {
sha3d_hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#endif
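The hash7 = &hash[49] // 3*16+1 offset follows from the 8x64 interleaved layout: 64-bit word j of lane l lives at u64 index j*8 + l, and on a little-endian host the 32-bit word 7 of a 256-bit digest is the high half of 64-bit word 3. As 32-bit index arithmetic:

   /* 32-bit word 7 of lane `lane` in an 8x64-interleaved 256-bit hash */
   uint32_t top = ((uint32_t*)hash)[ 2*(3*8 + lane) + 1 ];   /* = hash[ 49 + 2*lane ] */
   /* i.e. hash7[ lane<<1 ] with hash7 = &hash[49] */

The 4-way case is the same arithmetic with 4 lanes: 2*(3*4 + lane) + 1 = 25 + 2*lane, matching &hash[25] // 3*8+1.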

algo/keccak/sha3d.c   Normal file   (50 lines)
View File

@@ -0,0 +1,50 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
void sha3d_hash(void *state, const void *input)
{
uint32_t buffer[16];
sph_keccak256_context ctx_keccak;
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, input, 80 );
sph_keccak256_close( &ctx_keccak, buffer );
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, buffer, 32 );
sph_keccak256_close( &ctx_keccak, state );
}
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[32];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
const int thr_id = mythr->id;
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
do {
be32enc( &endiandata[19], n );
sha3d_hash( hash64, endiandata );
if ( valid_hash( hash64, ptarget ) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}

View File

@@ -32,8 +32,8 @@
#include <stddef.h>
#include <string.h>
#include "sph_keccak.h"
#include "keccak-gate.h"
#ifdef __cplusplus
extern "C"{
@@ -1616,7 +1616,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
} u; \
size_t j; \
\
eb = (0x100 | (ub & 0xFF)) >> (8 - n); \
eb = hard_coded_eb; \
if (kc->ptr == (lim - 1)) { \
if (n == 7) { \
u.tmp[0] = eb; \

View File

@@ -1,845 +0,0 @@
/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */
/*
* Keccak implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#define QSTATIC static
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include "sph_keccak.h"
#ifdef __cplusplus
extern "C"{
#endif
/*
* Parameters:
*
* SPH_KECCAK_64 use a 64-bit type
* SPH_KECCAK_INTERLEAVE use bit-interleaving (32-bit type only)
* SPH_KECCAK_NOCOPY do not copy the state into local variables
*
* If there is no usable 64-bit type, the code automatically switches
* back to the 32-bit implementation.
*
* Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1
* code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core
* (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302,
* 8 kB L1 code cache), seem to show that the following are optimal:
*
* -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds,
* do not copy the state; unrolling 2, 6 or all rounds also provides
* near-optimal performance.
* -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds,
* interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds
* also provides near-optimal performance.
* -- PowerPC: use the 64-bit implementation, unroll 8 rounds,
* copy the state. Unrolling 4 or 6 rounds is near-optimal.
* -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds,
* copy the state.
* -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy
* the state. Unrolling only 1 round is also near-optimal.
*
* Also, interleaving does not always yield actual improvements when
* using a 32-bit implementation; in particular when the architecture
* does not offer a native rotation opcode (interleaving replaces one
* 64-bit rotation with two 32-bit rotations, which is a gain only if
* there is a native 32-bit rotation opcode and not a native 64-bit
* rotation opcode; also, interleaving implies a small overhead when
* processing input words).
*
* To sum up:
* -- when possible, use the 64-bit code
* -- exception: on 32-bit x86, use 32-bit code
* -- when using 32-bit code, use interleaving
* -- copy the state, except on x86
* -- unroll 8 rounds on "big" machine, 2 rounds on "small" machines
*/
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/*
static const sph_u64 RC[] = {
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
};
*/
#define kekDECL_STATE \
sph_u64 keca00, keca01, keca02, keca03, keca04; \
sph_u64 keca10, keca11, keca12, keca13, keca14; \
sph_u64 keca20, keca21, keca22, keca23, keca24; \
sph_u64 keca30, keca31, keca32, keca33, keca34; \
sph_u64 keca40, keca41, keca42, keca43, keca44;
#define kekREAD_STATE(state) do { \
keca00 = (state)->kecu.wide[ 0]; \
keca10 = (state)->kecu.wide[ 1]; \
keca20 = (state)->kecu.wide[ 2]; \
keca30 = (state)->kecu.wide[ 3]; \
keca40 = (state)->kecu.wide[ 4]; \
keca01 = (state)->kecu.wide[ 5]; \
keca11 = (state)->kecu.wide[ 6]; \
keca21 = (state)->kecu.wide[ 7]; \
keca31 = (state)->kecu.wide[ 8]; \
keca41 = (state)->kecu.wide[ 9]; \
keca02 = (state)->kecu.wide[10]; \
keca12 = (state)->kecu.wide[11]; \
keca22 = (state)->kecu.wide[12]; \
keca32 = (state)->kecu.wide[13]; \
keca42 = (state)->kecu.wide[14]; \
keca03 = (state)->kecu.wide[15]; \
keca13 = (state)->kecu.wide[16]; \
keca23 = (state)->kecu.wide[17]; \
keca33 = (state)->kecu.wide[18]; \
keca43 = (state)->kecu.wide[19]; \
keca04 = (state)->kecu.wide[20]; \
keca14 = (state)->kecu.wide[21]; \
keca24 = (state)->kecu.wide[22]; \
keca34 = (state)->kecu.wide[23]; \
keca44 = (state)->kecu.wide[24]; \
} while (0)
#define kecREAD_STATE(state) do { \
keca00 = kecu.wide[ 0]; \
keca10 = kecu.wide[ 1]; \
keca20 = kecu.wide[ 2]; \
keca30 = kecu.wide[ 3]; \
keca40 = kecu.wide[ 4]; \
keca01 = kecu.wide[ 5]; \
keca11 = kecu.wide[ 6]; \
keca21 = kecu.wide[ 7]; \
keca31 = kecu.wide[ 8]; \
keca41 = kecu.wide[ 9]; \
keca02 = kecu.wide[10]; \
keca12 = kecu.wide[11]; \
keca22 = kecu.wide[12]; \
keca32 = kecu.wide[13]; \
keca42 = kecu.wide[14]; \
keca03 = kecu.wide[15]; \
keca13 = kecu.wide[16]; \
keca23 = kecu.wide[17]; \
keca33 = kecu.wide[18]; \
keca43 = kecu.wide[19]; \
keca04 = kecu.wide[20]; \
keca14 = kecu.wide[21]; \
keca24 = kecu.wide[22]; \
keca34 = kecu.wide[23]; \
keca44 = kecu.wide[24]; \
} while (0)
#define kecINIT_STATE() do { \
keca00 = 0x0000000000000000 \
^ sph_dec64le_aligned(buf + 0); \
keca10 = 0xFFFFFFFFFFFFFFFF \
^ sph_dec64le_aligned(buf + 8); \
keca20 = 0xFFFFFFFFFFFFFFFF \
^ sph_dec64le_aligned(buf + 16); \
keca30 = 0x0000000000000000 \
^ sph_dec64le_aligned(buf + 24); \
keca40 = 0x0000000000000000 \
^ sph_dec64le_aligned(buf + 32); \
keca01 = 0x0000000000000000 \
^ sph_dec64le_aligned(buf + 40); \
keca11 = 0x0000000000000000 \
^ sph_dec64le_aligned(buf + 48); \
keca21 = 0x0000000000000000 \
^ sph_dec64le_aligned(buf + 56); \
keca31 = 0xFFFFFFFFFFFFFFFF \
^ sph_dec64le_aligned(buf + 64); \
keca41 = 0x0000000000000000, \
keca02 = 0x0000000000000000, \
keca12 = 0x0000000000000000, \
keca32 = 0x0000000000000000, \
keca42 = 0x0000000000000000, \
keca03 = 0x0000000000000000, \
keca13 = 0x0000000000000000, \
keca33 = 0x0000000000000000, \
keca43 = 0x0000000000000000, \
keca14 = 0x0000000000000000, \
keca24 = 0x0000000000000000, \
keca34 = 0x0000000000000000, \
keca44 = 0x0000000000000000; \
keca23 = 0xFFFFFFFFFFFFFFFF, \
keca04 = 0xFFFFFFFFFFFFFFFF, \
keca22 = 0xFFFFFFFFFFFFFFFF; \
} while (0)
#define kekWRITE_STATE(state) do { \
(state)->kecu.wide[ 0] = keca00; \
(state)->kecu.wide[ 1] = ~keca10; \
(state)->kecu.wide[ 2] = ~keca20; \
(state)->kecu.wide[ 3] = keca30; \
(state)->kecu.wide[ 4] = keca40; \
(state)->kecu.wide[ 5] = keca01; \
(state)->kecu.wide[ 6] = keca11; \
(state)->kecu.wide[ 7] = keca21; \
(state)->kecu.wide[ 8] = ~keca31; \
(state)->kecu.wide[ 9] = keca41; \
(state)->kecu.wide[10] = keca02; \
(state)->kecu.wide[11] = keca12; \
(state)->kecu.wide[12] = ~keca22; \
(state)->kecu.wide[13] = keca32; \
(state)->kecu.wide[14] = keca42; \
(state)->kecu.wide[15] = keca03; \
(state)->kecu.wide[16] = keca13; \
(state)->kecu.wide[17] = ~keca23; \
(state)->kecu.wide[18] = keca33; \
(state)->kecu.wide[19] = keca43; \
(state)->kecu.wide[20] = ~keca04; \
(state)->kecu.wide[21] = keca14; \
(state)->kecu.wide[22] = keca24; \
(state)->kecu.wide[23] = keca34; \
(state)->kecu.wide[24] = keca44; \
} while (0)
/* only useful for the one-round final */
#define kecWRITE_STATE(state) do { \
kecu.wide[ 0] = keca00; \
kecu.wide[ 1] = ~keca10; \
kecu.wide[ 2] = ~keca20; \
kecu.wide[ 3] = keca30; \
kecu.wide[ 4] = keca40; \
kecu.wide[ 5] = keca01; \
kecu.wide[ 6] = keca11; \
kecu.wide[ 7] = keca21; \
kecu.wide[ 8] = ~keca31; \
kecu.wide[ 9] = keca41; \
kecu.wide[10] = keca02; \
kecu.wide[11] = keca12; \
kecu.wide[12] = ~keca22; \
kecu.wide[13] = keca32; \
kecu.wide[14] = keca42; \
kecu.wide[15] = keca03; \
kecu.wide[16] = keca13; \
kecu.wide[17] = ~keca23; \
kecu.wide[18] = keca33; \
kecu.wide[19] = keca43; \
kecu.wide[20] = ~keca04; \
kecu.wide[21] = keca14; \
kecu.wide[22] = keca24; \
kecu.wide[23] = keca34; \
kecu.wide[24] = keca44; \
} while (0)
#define kecPRINT_STATE(state) do { \
printf("keca00=%lX\n", keca00); \
printf("keca10=%lX\n", keca10); \
printf("keca20=%lX\n", keca20); \
printf("keca30=%lX\n", keca30); \
printf("keca40=%lX\n", keca40); \
printf("keca01=%lX\n", keca01); \
printf("keca11=%lX\n", keca11); \
printf("keca21=%lX\n", keca21); \
printf("keca31=%lX\n", keca31); \
printf("keca41=%lX\n", keca41); \
printf("keca02=%lX\n", keca02); \
printf("keca12=%lX\n", keca12); \
printf("keca22=%lX\n", keca22); \
printf("keca32=%lX\n", keca32); \
printf("keca42=%lX\n", keca42); \
printf("keca03=%lX\n", keca03); \
printf("keca13=%lX\n", keca13); \
printf("keca23=%lX\n", keca23); \
printf("keca33=%lX\n", keca33); \
printf("keca43=%lX\n", keca43); \
printf("keca04=%lX\n", keca04); \
printf("keca14=%lX\n", keca14); \
printf("keca24=%lX\n", keca24); \
printf("keca34=%lX\n", keca34); \
printf("keca44=%lX\n", keca44); \
abort(); \
} while (0)
#define kekINPUT_BUF() do { \
} while (0)
#define kekDECL64(x) sph_u64 x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = a ^ b)
#define AND64(d, a, b) (d = a & b)
#define OR64(d, a, b) (d = a | b)
#define NOT64(d, s) (d = SPH_T64(~s))
#define ROL64(d, v, n) (d = SPH_ROTL64(v, n))
#define XOR64_IOTA XOR64
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
kekDECL64(tt0); \
kekDECL64(tt1); \
kekDECL64(tt2); \
kekDECL64(tt3); \
XOR64(tt0, d0, d1); \
XOR64(tt1, d2, d3); \
XOR64(tt0, tt0, d4); \
XOR64(tt0, tt0, tt1); \
ROL64(tt0, tt0, 1); \
XOR64(tt2, c0, c1); \
XOR64(tt3, c2, c3); \
XOR64(tt0, tt0, c4); \
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
kekDECL64(t0); \
kekDECL64(t1); \
kekDECL64(t2); \
kekDECL64(t3); \
kekDECL64(t4); \
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
XOR64(b00, b00, t0); \
XOR64(b01, b01, t0); \
XOR64(b02, b02, t0); \
XOR64(b03, b03, t0); \
XOR64(b04, b04, t0); \
XOR64(b10, b10, t1); \
XOR64(b11, b11, t1); \
XOR64(b12, b12, t1); \
XOR64(b13, b13, t1); \
XOR64(b14, b14, t1); \
XOR64(b20, b20, t2); \
XOR64(b21, b21, t2); \
XOR64(b22, b22, t2); \
XOR64(b23, b23, t2); \
XOR64(b24, b24, t2); \
XOR64(b30, b30, t3); \
XOR64(b31, b31, t3); \
XOR64(b32, b32, t3); \
XOR64(b33, b33, t3); \
XOR64(b34, b34, t3); \
XOR64(b40, b40, t4); \
XOR64(b41, b41, t4); \
XOR64(b42, b42, t4); \
XOR64(b43, b43, t4); \
XOR64(b44, b44, t4); \
} while (0)
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
/* ROL64(b00, b00, 0); */ \
ROL64(b01, b01, 36); \
ROL64(b02, b02, 3); \
ROL64(b03, b03, 41); \
ROL64(b04, b04, 18); \
ROL64(b10, b10, 1); \
ROL64(b11, b11, 44); \
ROL64(b12, b12, 10); \
ROL64(b13, b13, 45); \
ROL64(b14, b14, 2); \
ROL64(b20, b20, 62); \
ROL64(b21, b21, 6); \
ROL64(b22, b22, 43); \
ROL64(b23, b23, 15); \
ROL64(b24, b24, 61); \
ROL64(b30, b30, 28); \
ROL64(b31, b31, 55); \
ROL64(b32, b32, 25); \
ROL64(b33, b33, 21); \
ROL64(b34, b34, 56); \
ROL64(b40, b40, 27); \
ROL64(b41, b41, 20); \
ROL64(b42, b42, 39); \
ROL64(b43, b43, 8); \
ROL64(b44, b44, 14); \
} while (0)
/*
* The KHI macro integrates the "lane complement" optimization. On input,
* some words are complemented:
* keca00 keca01 keca02 keca04 keca13 keca20 keca21 keca22 keca30 keca33 keca34 keca43
* On output, the following words are complemented:
* keca04 keca10 keca20 keca22 keca23 keca31
*
* The (implicit) permutation and the theta expansion will bring back
* the input mask for the next round.
*/
#define KHI_XO(d, a, b, c) do { \
kekDECL64(kt); \
OR64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI_XA(d, a, b, c) do { \
kekDECL64(kt); \
AND64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
kekDECL64(c0); \
kekDECL64(c1); \
kekDECL64(c2); \
kekDECL64(c3); \
kekDECL64(c4); \
kekDECL64(bnn); \
NOT64(bnn, b20); \
KHI_XO(c0, b00, b10, b20); \
KHI_XO(c1, b10, bnn, b30); \
KHI_XA(c2, b20, b30, b40); \
KHI_XO(c3, b30, b40, b00); \
KHI_XA(c4, b40, b00, b10); \
MOV64(b00, c0); \
MOV64(b10, c1); \
MOV64(b20, c2); \
MOV64(b30, c3); \
MOV64(b40, c4); \
NOT64(bnn, b41); \
KHI_XO(c0, b01, b11, b21); \
KHI_XA(c1, b11, b21, b31); \
KHI_XO(c2, b21, b31, bnn); \
KHI_XO(c3, b31, b41, b01); \
KHI_XA(c4, b41, b01, b11); \
MOV64(b01, c0); \
MOV64(b11, c1); \
MOV64(b21, c2); \
MOV64(b31, c3); \
MOV64(b41, c4); \
NOT64(bnn, b32); \
KHI_XO(c0, b02, b12, b22); \
KHI_XA(c1, b12, b22, b32); \
KHI_XA(c2, b22, bnn, b42); \
KHI_XO(c3, bnn, b42, b02); \
KHI_XA(c4, b42, b02, b12); \
MOV64(b02, c0); \
MOV64(b12, c1); \
MOV64(b22, c2); \
MOV64(b32, c3); \
MOV64(b42, c4); \
NOT64(bnn, b33); \
KHI_XA(c0, b03, b13, b23); \
KHI_XO(c1, b13, b23, b33); \
KHI_XO(c2, b23, bnn, b43); \
KHI_XA(c3, bnn, b43, b03); \
KHI_XO(c4, b43, b03, b13); \
MOV64(b03, c0); \
MOV64(b13, c1); \
MOV64(b23, c2); \
MOV64(b33, c3); \
MOV64(b43, c4); \
NOT64(bnn, b14); \
KHI_XA(c0, b04, bnn, b24); \
KHI_XO(c1, bnn, b24, b34); \
KHI_XA(c2, b24, b34, b44); \
KHI_XO(c3, b34, b44, b04); \
KHI_XA(c4, b44, b04, b14); \
MOV64(b04, c0); \
MOV64(b14, c1); \
MOV64(b24, c2); \
MOV64(b34, c3); \
MOV64(b44, c4); \
} while (0)
#define IOTA(r) XOR64_IOTA(keca00, keca00, r)
#define P0 keca00, keca01, keca02, keca03, keca04, keca10, keca11, keca12, keca13, keca14, keca20, keca21, \
keca22, keca23, keca24, keca30, keca31, keca32, keca33, keca34, keca40, keca41, keca42, keca43, keca44
#define P1 keca00, keca30, keca10, keca40, keca20, keca11, keca41, keca21, keca01, keca31, keca22, keca02, \
keca32, keca12, keca42, keca33, keca13, keca43, keca23, keca03, keca44, keca24, keca04, keca34, keca14
#define P2 keca00, keca33, keca11, keca44, keca22, keca41, keca24, keca02, keca30, keca13, keca32, keca10, \
keca43, keca21, keca04, keca23, keca01, keca34, keca12, keca40, keca14, keca42, keca20, keca03, keca31
#define P3 keca00, keca23, keca41, keca14, keca32, keca24, keca42, keca10, keca33, keca01, keca43, keca11, \
keca34, keca02, keca20, keca12, keca30, keca03, keca21, keca44, keca31, keca04, keca22, keca40, keca13
#define P4 keca00, keca12, keca24, keca31, keca43, keca42, keca04, keca11, keca23, keca30, keca34, keca41, \
keca03, keca10, keca22, keca21, keca33, keca40, keca02, keca14, keca13, keca20, keca32, keca44, keca01
#define P5 keca00, keca21, keca42, keca13, keca34, keca04, keca20, keca41, keca12, keca33, keca03, keca24, \
keca40, keca11, keca32, keca02, keca23, keca44, keca10, keca31, keca01, keca22, keca43, keca14, keca30
#define P6 keca00, keca02, keca04, keca01, keca03, keca20, keca22, keca24, keca21, keca23, keca40, keca42, \
keca44, keca41, keca43, keca10, keca12, keca14, keca11, keca13, keca30, keca32, keca34, keca31, keca33
#define P7 keca00, keca10, keca20, keca30, keca40, keca22, keca32, keca42, keca02, keca12, keca44, keca04, \
keca14, keca24, keca34, keca11, keca21, keca31, keca41, keca01, keca33, keca43, keca03, keca13, keca23
#define P8 keca00, keca11, keca22, keca33, keca44, keca32, keca43, keca04, keca10, keca21, keca14, keca20, \
keca31, keca42, keca03, keca41, keca02, keca13, keca24, keca30, keca23, keca34, keca40, keca01, keca12
#define P9 keca00, keca41, keca32, keca23, keca14, keca43, keca34, keca20, keca11, keca02, keca31, keca22, \
keca13, keca04, keca40, keca24, keca10, keca01, keca42, keca33, keca12, keca03, keca44, keca30, keca21
#define P10 keca00, keca24, keca43, keca12, keca31, keca34, keca03, keca22, keca41, keca10, keca13, keca32, \
keca01, keca20, keca44, keca42, keca11, keca30, keca04, keca23, keca21, keca40, keca14, keca33, keca02
#define P11 keca00, keca42, keca34, keca21, keca13, keca03, keca40, keca32, keca24, keca11, keca01, keca43, \
keca30, keca22, keca14, keca04, keca41, keca33, keca20, keca12, keca02, keca44, keca31, keca23, keca10
#define P12 keca00, keca04, keca03, keca02, keca01, keca40, keca44, keca43, keca42, keca41, keca30, keca34, \
keca33, keca32, keca31, keca20, keca24, keca23, keca22, keca21, keca10, keca14, keca13, keca12, keca11
#define P13 keca00, keca20, keca40, keca10, keca30, keca44, keca14, keca34, keca04, keca24, keca33, keca03, \
keca23, keca43, keca13, keca22, keca42, keca12, keca32, keca02, keca11, keca31, keca01, keca21, keca41
#define P14 keca00, keca22, keca44, keca11, keca33, keca14, keca31, keca03, keca20, keca42, keca23, keca40, \
keca12, keca34, keca01, keca32, keca04, keca21, keca43, keca10, keca41, keca13, keca30, keca02, keca24
#define P15 keca00, keca32, keca14, keca41, keca23, keca31, keca13, keca40, keca22, keca04, keca12, keca44, \
keca21, keca03, keca30, keca43, keca20, keca02, keca34, keca11, keca24, keca01, keca33, keca10, keca42
#define P16 keca00, keca43, keca31, keca24, keca12, keca13, keca01, keca44, keca32, keca20, keca21, keca14, \
keca02, keca40, keca33, keca34, keca22, keca10, keca03, keca41, keca42, keca30, keca23, keca11, keca04
#define P17 keca00, keca34, keca13, keca42, keca21, keca01, keca30, keca14, keca43, keca22, keca02, keca31, \
keca10, keca44, keca23, keca03, keca32, keca11, keca40, keca24, keca04, keca33, keca12, keca41, keca20
#define P18 keca00, keca03, keca01, keca04, keca02, keca30, keca33, keca31, keca34, keca32, keca10, keca13, \
keca11, keca14, keca12, keca40, keca43, keca41, keca44, keca42, keca20, keca23, keca21, keca24, keca22
#define P19 keca00, keca40, keca30, keca20, keca10, keca33, keca23, keca13, keca03, keca43, keca11, keca01, \
keca41, keca31, keca21, keca44, keca34, keca24, keca14, keca04, keca22, keca12, keca02, keca42, keca32
#define P20 keca00, keca44, keca33, keca22, keca11, keca23, keca12, keca01, keca40, keca34, keca41, keca30, \
keca24, keca13, keca02, keca14, keca03, keca42, keca31, keca20, keca32, keca21, keca10, keca04, keca43
#define P21 keca00, keca14, keca23, keca32, keca41, keca12, keca21, keca30, keca44, keca03, keca24, keca33, \
keca42, keca01, keca10, keca31, keca40, keca04, keca13, keca22, keca43, keca02, keca11, keca20, keca34
#define P22 keca00, keca31, keca12, keca43, keca24, keca21, keca02, keca33, keca14, keca40, keca42, keca23, \
keca04, keca30, keca11, keca13, keca44, keca20, keca01, keca32, keca34, keca10, keca41, keca22, keca03
#define P23 keca00, keca13, keca21, keca34, keca42, keca02, keca10, keca23, keca31, keca44, keca04, keca12, \
keca20, keca33, keca41, keca01, keca14, keca22, keca30, keca43, keca03, keca11, keca24, keca32, keca40
#define P1_TO_P0 do { \
kekDECL64(t); \
MOV64(t, keca01); \
MOV64(keca01, keca30); \
MOV64(keca30, keca33); \
MOV64(keca33, keca23); \
MOV64(keca23, keca12); \
MOV64(keca12, keca21); \
MOV64(keca21, keca02); \
MOV64(keca02, keca10); \
MOV64(keca10, keca11); \
MOV64(keca11, keca41); \
MOV64(keca41, keca24); \
MOV64(keca24, keca42); \
MOV64(keca42, keca04); \
MOV64(keca04, keca20); \
MOV64(keca20, keca22); \
MOV64(keca22, keca32); \
MOV64(keca32, keca43); \
MOV64(keca43, keca34); \
MOV64(keca34, keca03); \
MOV64(keca03, keca40); \
MOV64(keca40, keca44); \
MOV64(keca44, keca14); \
MOV64(keca14, keca31); \
MOV64(keca31, keca13); \
MOV64(keca13, t); \
} while (0)
#define P2_TO_P0 do { \
kekDECL64(t); \
MOV64(t, keca01); \
MOV64(keca01, keca33); \
MOV64(keca33, keca12); \
MOV64(keca12, keca02); \
MOV64(keca02, keca11); \
MOV64(keca11, keca24); \
MOV64(keca24, keca04); \
MOV64(keca04, keca22); \
MOV64(keca22, keca43); \
MOV64(keca43, keca03); \
MOV64(keca03, keca44); \
MOV64(keca44, keca31); \
MOV64(keca31, t); \
MOV64(t, keca10); \
MOV64(keca10, keca41); \
MOV64(keca41, keca42); \
MOV64(keca42, keca20); \
MOV64(keca20, keca32); \
MOV64(keca32, keca34); \
MOV64(keca34, keca40); \
MOV64(keca40, keca14); \
MOV64(keca14, keca13); \
MOV64(keca13, keca30); \
MOV64(keca30, keca23); \
MOV64(keca23, keca21); \
MOV64(keca21, t); \
} while (0)
#define P4_TO_P0 do { \
kekDECL64(t); \
MOV64(t, keca01); \
MOV64(keca01, keca12); \
MOV64(keca12, keca11); \
MOV64(keca11, keca04); \
MOV64(keca04, keca43); \
MOV64(keca43, keca44); \
MOV64(keca44, t); \
MOV64(t, keca02); \
MOV64(keca02, keca24); \
MOV64(keca24, keca22); \
MOV64(keca22, keca03); \
MOV64(keca03, keca31); \
MOV64(keca31, keca33); \
MOV64(keca33, t); \
MOV64(t, keca10); \
MOV64(keca10, keca42); \
MOV64(keca42, keca32); \
MOV64(keca32, keca40); \
MOV64(keca40, keca13); \
MOV64(keca13, keca23); \
MOV64(keca23, t); \
MOV64(t, keca14); \
MOV64(keca14, keca30); \
MOV64(keca30, keca21); \
MOV64(keca21, keca41); \
MOV64(keca41, keca20); \
MOV64(keca20, keca34); \
MOV64(keca34, t); \
} while (0)
#define P6_TO_P0 do { \
kekDECL64(t); \
MOV64(t, keca01); \
MOV64(keca01, keca02); \
MOV64(keca02, keca04); \
MOV64(keca04, keca03); \
MOV64(keca03, t); \
MOV64(t, keca10); \
MOV64(keca10, keca20); \
MOV64(keca20, keca40); \
MOV64(keca40, keca30); \
MOV64(keca30, t); \
MOV64(t, keca11); \
MOV64(keca11, keca22); \
MOV64(keca22, keca44); \
MOV64(keca44, keca33); \
MOV64(keca33, t); \
MOV64(t, keca12); \
MOV64(keca12, keca24); \
MOV64(keca24, keca43); \
MOV64(keca43, keca31); \
MOV64(keca31, t); \
MOV64(t, keca13); \
MOV64(keca13, keca21); \
MOV64(keca21, keca42); \
MOV64(keca42, keca34); \
MOV64(keca34, t); \
MOV64(t, keca14); \
MOV64(keca14, keca23); \
MOV64(keca23, keca41); \
MOV64(keca41, keca32); \
MOV64(keca32, t); \
} while (0)
#define P8_TO_P0 do { \
kekDECL64(t); \
MOV64(t, keca01); \
MOV64(keca01, keca11); \
MOV64(keca11, keca43); \
MOV64(keca43, t); \
MOV64(t, keca02); \
MOV64(keca02, keca22); \
MOV64(keca22, keca31); \
MOV64(keca31, t); \
MOV64(t, keca03); \
MOV64(keca03, keca33); \
MOV64(keca33, keca24); \
MOV64(keca24, t); \
MOV64(t, keca04); \
MOV64(keca04, keca44); \
MOV64(keca44, keca12); \
MOV64(keca12, t); \
MOV64(t, keca10); \
MOV64(keca10, keca32); \
MOV64(keca32, keca13); \
MOV64(keca13, t); \
MOV64(t, keca14); \
MOV64(keca14, keca21); \
MOV64(keca21, keca20); \
MOV64(keca20, t); \
MOV64(t, keca23); \
MOV64(keca23, keca42); \
MOV64(keca42, keca40); \
MOV64(keca40, t); \
MOV64(t, keca30); \
MOV64(keca30, keca41); \
MOV64(keca41, keca34); \
MOV64(keca34, t); \
} while (0)
#define P12_TO_P0 do { \
kekDECL64(t); \
MOV64(t, keca01); \
MOV64(keca01, keca04); \
MOV64(keca04, t); \
MOV64(t, keca02); \
MOV64(keca02, keca03); \
MOV64(keca03, t); \
MOV64(t, keca10); \
MOV64(keca10, keca40); \
MOV64(keca40, t); \
MOV64(t, keca11); \
MOV64(keca11, keca44); \
MOV64(keca44, t); \
MOV64(t, keca12); \
MOV64(keca12, keca43); \
MOV64(keca43, t); \
MOV64(t, keca13); \
MOV64(keca13, keca42); \
MOV64(keca42, t); \
MOV64(t, keca14); \
MOV64(keca14, keca41); \
MOV64(keca41, t); \
MOV64(t, keca20); \
MOV64(keca20, keca30); \
MOV64(keca30, t); \
MOV64(t, keca21); \
MOV64(keca21, keca34); \
MOV64(keca34, t); \
MOV64(t, keca22); \
MOV64(keca22, keca33); \
MOV64(keca33, t); \
MOV64(t, keca23); \
MOV64(keca23, keca32); \
MOV64(keca32, t); \
MOV64(t, keca24); \
MOV64(keca24, keca31); \
MOV64(keca31, t); \
} while (0)
#define LPAR (
#define RPAR )
#define KF_ELT(r, s, k) do { \
THETA LPAR P ## r RPAR; \
RHO LPAR P ## r RPAR; \
KHI LPAR P ## s RPAR; \
IOTA(k); \
} while (0)
#define DO(x) x
#define KECCAK_F_1600 DO(KECCAK_F_1600_)
/*
* removed loop unrolling
* tested faster saving space
*/
#define KECCAK_F_1600_ do { \
static const sph_u64 RC[] = { \
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), \
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), \
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), \
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), \
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), \
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), \
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), \
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), \
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), \
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), \
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), \
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) \
}; \
int j; \
for (j = 0; j < 24; j += 4) { \
KF_ELT( 0, 1, RC[j + 0]); \
KF_ELT( 1, 2, RC[j + 1]); \
KF_ELT( 2, 3, RC[j + 2]); \
KF_ELT( 3, 4, RC[j + 3]); \
P4_TO_P0; \
} \
} while (0)
/*
KF_ELT( 0, 1, RC[j + 0]); \
KF_ELT( 1, 2, RC[j + 1]); \
KF_ELT( 2, 3, RC[j + 2]); \
KF_ELT( 3, 4, RC[j + 3]); \
KF_ELT( 4, 5, RC[j + 4]); \
KF_ELT( 5, 6, RC[j + 5]); \
KF_ELT( 6, 7, RC[j + 6]); \
KF_ELT( 7, 8, RC[j + 7]); \
kekDECL_STATE \
*/
#define DECL_KEC
/*
sph_u64 keca00, keca01, keca02, keca03, keca04; \
sph_u64 keca10, keca11, keca12, keca13, keca14; \
sph_u64 keca20, keca21, keca22, keca23, keca24; \
sph_u64 keca30, keca31, keca32, keca33, keca34; \
sph_u64 keca40, keca41, keca42, keca43, keca44;
*/
/* load initial constants */
#define KEC_I
//static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 };
/*
unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
*/
/* load hash for loop */
#define KEC_U \
do { \
static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
/*memcpy(hashbuf, hash, 64); */ \
memcpy(hash + 64, keczword, 8); \
} while (0);
/* keccak512 hash loaded */
/* hash = keccak512(loaded */
#define KEC_C \
do { \
kekDECL_STATE \
unsigned char *buf = hash; \
/*BEGIN CORE */ \
kecINIT_STATE(); \
KECCAK_F_1600; \
/*END CORE */ \
/* Finalize the "lane complement" */ \
sph_enc64le_aligned((unsigned char*)(hash) + 0, keca00); \
sph_enc64le_aligned((unsigned char*)(hash) + 8, ~keca10); \
sph_enc64le_aligned((unsigned char*)(hash) + 16, ~keca20); \
sph_enc64le_aligned((unsigned char*)(hash) + 24, keca30); \
sph_enc64le_aligned((unsigned char*)(hash) + 32, keca40); \
sph_enc64le_aligned((unsigned char*)(hash) + 40, keca01); \
sph_enc64le_aligned((unsigned char*)(hash) + 48, keca11); \
sph_enc64le_aligned((unsigned char*)(hash) + 56, keca21); \
} while (0);
#ifdef __cplusplus
}
#endif
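One detail of the file removed above is worth recording: its round macros avoid the physical pi step entirely. Each Pn is the same 25 state words listed in a permuted order, so KF_ELT(r, s, k) evaluates theta/rho under labeling Pr and khi/iota under labeling Ps with no data movement; after four such re-labeled rounds the words have drifted from their home slots, and P4_TO_P0 swaps them back, which is why KECCAK_F_1600_ steps its 24 rounds in groups of four:

   for ( j = 0; j < 24; j += 4 ) {       /* as in the removed macro */
      KF_ELT( 0, 1, RC[j + 0] );         /* theta/rho on P0, khi/iota on P1 */
      KF_ELT( 1, 2, RC[j + 1] );
      KF_ELT( 2, 3, RC[j + 2] );
      KF_ELT( 3, 4, RC[j + 3] );
      P4_TO_P0;                          /* restore physical word order */
   }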

View File

@@ -1,102 +0,0 @@
/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
/**
* Keccak interface. This is the interface for Keccak with the
* recommended parameters for SHA-3, with output lengths 224, 256,
* 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_keccak.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_KECCAK_H__
#define SPH_KECCAK_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#define QSTATIC static
/**
* Output size (in bits) for Keccak-512.
*/
#define SPH_SIZE_keccak512 512
/**
* This structure is a context for Keccak computations: it contains the
* intermediate values and some data from the last entered block. Once a
* Keccak computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running Keccak computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
/**
* Type for a Keccak-512 context (identical to the common context).
*/
/**
* Initialize a Keccak-512 context. This process performs no memory allocation.
*
* @param cc the Keccak-512 context (pointer to a
* <code>sph_keccak512_context</code>)
*/
/**
* Terminate the current Keccak-512 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the Keccak-512 context
* @param dst the destination buffer
*/
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (64 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the Keccak-512 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -459,6 +459,11 @@ int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
return 0;
}
int luffa512_4way_init( luffa_4way_context *state )
{
return luffa_4way_init( state, 512 );
}
// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called, only call luffa_update or luffa_close.
int luffa_4way_update( luffa_4way_context *state, const void *data,
@@ -496,6 +501,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
return 0;
}
/*
int luffa512_4way_update( luffa_4way_context *state, const void *data,
size_t len )
{
return luffa_4way_update( state, data, len );
}
*/
int luffa_4way_close( luffa_4way_context *state, void *hashval )
{
__m512i *buffer = (__m512i*)state->buffer;
@@ -518,6 +531,77 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval )
return 0;
}
/*
int luffa512_4way_close( luffa_4way_context *state, void *hashval )
{
return luffa_4way_close( state, hashval );
}
*/
int luffa512_4way_full( luffa_4way_context *state, void *output,
const void *data, size_t inlen )
{
state->hashbitlen = 512;
__m128i *iv = (__m128i*)IV;
state->chainv[0] = m512_const1_128( iv[0] );
state->chainv[1] = m512_const1_128( iv[1] );
state->chainv[2] = m512_const1_128( iv[2] );
state->chainv[3] = m512_const1_128( iv[3] );
state->chainv[4] = m512_const1_128( iv[4] );
state->chainv[5] = m512_const1_128( iv[5] );
state->chainv[6] = m512_const1_128( iv[6] );
state->chainv[7] = m512_const1_128( iv[7] );
state->chainv[8] = m512_const1_128( iv[8] );
state->chainv[9] = m512_const1_128( iv[9] );
((__m512i*)state->buffer)[0] = m512_zero;
((__m512i*)state->buffer)[1] = m512_zero;
const __m512i *vdata = (__m512i*)data;
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = m512_const_64(
0x3c3d3e3f38393a3b, 0x3435363730313233,
0x2c2d2e2f28292a2b, 0x2425262720212223,
0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_4way( state, msg );
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = m512_const2_64( 0, 0x0000000080000000 );
rnd512_4way( state, msg );
}
else
{
// empty pad block
msg[0] = m512_const2_64( 0, 0x0000000080000000 );
msg[1] = m512_zero;
rnd512_4way( state, msg );
}
finalization512_4way( state, (uint32*)output );
if ( state->hashbitlen > 512 )
finalization512_4way( state, (uint32*)( output+64 ) );
return 0;
}
int luffa_4way_update_close( luffa_4way_context *state,
void *output, const void *data, size_t inlen )
{
@@ -1031,6 +1115,69 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
return 0;
}
int luffa512_2way_full( luffa_2way_context *state, void *output,
const void *data, size_t inlen )
{
state->hashbitlen = 512;
__m128i *iv = (__m128i*)IV;
state->chainv[0] = m256_const1_128( iv[0] );
state->chainv[1] = m256_const1_128( iv[1] );
state->chainv[2] = m256_const1_128( iv[2] );
state->chainv[3] = m256_const1_128( iv[3] );
state->chainv[4] = m256_const1_128( iv[4] );
state->chainv[5] = m256_const1_128( iv[5] );
state->chainv[6] = m256_const1_128( iv[6] );
state->chainv[7] = m256_const1_128( iv[7] );
state->chainv[8] = m256_const1_128( iv[8] );
state->chainv[9] = m256_const1_128( iv[9] );
((__m256i*)state->buffer)[0] = m256_zero;
((__m256i*)state->buffer)[1] = m256_zero;
const __m256i *vdata = (__m256i*)data;
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_2way( state, msg );
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = m256_const2_64( 0, 0x0000000080000000 );
rnd512_2way( state, msg );
}
else
{
// empty pad block
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}
finalization512_2way( state, (uint32*)output );
if ( state->hashbitlen > 512 )
finalization512_2way( state, (uint32*)( output+32 ) );
return 0;
}
int luffa_2way_update_close( luffa_2way_context *state,
void *output, const void *data, size_t inlen )
{

View File

@@ -61,11 +61,23 @@ typedef struct {
} luffa_4way_context __attribute((aligned(128)));
int luffa_4way_init( luffa_4way_context *state, int hashbitlen );
int luffa_4way_update( luffa_4way_context *state, const void *data,
size_t len );
int luffa_4way_close( luffa_4way_context *state, void *hashval );
//int luffa_4way_update( luffa_4way_context *state, const void *data,
// size_t len );
//int luffa_4way_close( luffa_4way_context *state, void *hashval );
int luffa_4way_update_close( luffa_4way_context *state, void *output,
const void *data, size_t inlen );
int luffa512_4way_full( luffa_4way_context *state, void *output,
const void *data, size_t inlen );
int luffa512_4way_init( luffa_4way_context *state );
int luffa512_4way_update( luffa_4way_context *state, const void *data,
size_t len );
int luffa512_4way_close( luffa_4way_context *state, void *hashval );
int luffa512_4way_update_close( luffa_4way_context *state, void *output,
const void *data, size_t inlen );
#define luffa_4way_update luffa512_4way_update
#define luffa_4way_close luffa512_4way_close
#define luffa_4way_update_close luffa512_4way_update_close
#endif
@@ -82,6 +94,8 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
int luffa_2way_close( luffa_2way_context *state, void *hashval );
int luffa_2way_update_close( luffa_2way_context *state, void *output,
const void *data, size_t inlen );
int luffa512_2way_full( luffa_2way_context *state, void *output,
const void *data, size_t inlen );
#endif
#endif
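Taken together with the ordering note in luffa-hash-2way.c (after luffa_4way_update, only further updates or a plain close are supported), the header above now offers three valid call patterns. A hedged sketch for the 4-way, 512-bit case (buffer sizes illustrative; inputs are 4-lane interleaved, lengths per lane):

   uint32_t vin[20*4] __attribute__ ((aligned (64)));   /* 4 x 80-byte inputs */
   uint32_t vout[16*4] __attribute__ ((aligned (64)));  /* 4 x 64-byte digests */
   luffa_4way_context ctx;

   /* one-shot (new in this change): init + absorb + pad + finalize */
   luffa512_4way_full( &ctx, vout, vin, 80 );

   /* streaming: update buffers any partial final block, then close pads */
   luffa_4way_init( &ctx, 512 );
   luffa_4way_update( &ctx, vin, 80 );
   luffa_4way_close( &ctx, vout );

   /* combined absorb + finalize over a whole message */
   luffa_4way_init( &ctx, 512 );
   luffa_4way_update_close( &ctx, vout, vin, 80 );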

View File

@@ -7,33 +7,44 @@
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"
#if defined(__VAES__)
#include "algo/groestl/groestl256-hash-4way.h"
#endif
#if defined (ALLIUM_8WAY)
#if defined (ALLIUM_16WAY)
typedef struct {
blake256_8way_context blake;
blake256_16way_context blake;
keccak256_8way_context keccak;
cube_4way_context cube;
skein256_8way_context skein;
#if defined(__VAES__)
groestl256_4way_context groestl;
#else
hashState_groestl256 groestl;
} allium_8way_ctx_holder;
#endif
} allium_16way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx;
static __thread allium_16way_ctx_holder allium_16way_ctx;
bool init_allium_8way_ctx()
bool init_allium_16way_ctx()
{
keccak256_8way_init( &allium_8way_ctx.keccak );
cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &allium_8way_ctx.skein );
init_groestl256( &allium_8way_ctx.groestl, 32 );
keccak256_8way_init( &allium_16way_ctx.keccak );
cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &allium_16way_ctx.skein );
#if defined(__VAES__)
groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
#else
init_groestl256( &allium_16way_ctx.groestl, 32 );
#endif
return true;
}
void allium_8way_hash( void *state, const void *input )
void allium_16way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -42,19 +53,39 @@ void allium_8way_hash( void *state, const void *input )
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
blake256_16way_close( &ctx.blake, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
vhash, 256 );
intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
256 );
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
// rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
keccak256_8way_close( &ctx.keccak, vhashA);
keccak256_8way_init( &ctx.keccak );
keccak256_8way_update( &ctx.keccak, vhashB, 32 );
keccak256_8way_close( &ctx.keccak, vhashB);
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 256 );
vhashA, 256 );
dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
@@ -68,20 +99,19 @@ void allium_8way_hash( void *state, const void *input )
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
/*
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
*/
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
@@ -92,6 +122,17 @@ void allium_8way_hash( void *state, const void *input )
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
@@ -104,9 +145,208 @@ void allium_8way_hash( void *state, const void *input )
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
skein256_8way_update( &ctx.skein, vhashA, 32 );
skein256_8way_close( &ctx.skein, vhashA );
skein256_8way_init( &ctx.skein );
skein256_8way_update( &ctx.skein, vhashB, 32 );
skein256_8way_close( &ctx.skein, vhashB );
/*
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
vhashB, 256 );
#if defined(__VAES__)
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 );
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
groestl256_4way_init( &ctx.groestl, 32 );
groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 );
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
groestl256_4way_init( &ctx.groestl, 32 );
groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 );
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
groestl256_4way_init( &ctx.groestl, 32 );
groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 );
#else
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+256, hash8, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+288, hash9, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+320, hash10, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+352, hash11, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+384, hash12, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+416, hash13, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+448, hash14, 256 );
memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+480, hash15, 256 );
#endif
}
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
blake256_16way_init( &allium_16way_ctx.blake );
blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
do {
allium_16way_hash( hash, vdata );
for ( int lane = 0; lane < 16; lane++ )
if unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (ALLIUM_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
hashState_groestl256 groestl;
} allium_8way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx;
bool init_allium_8way_ctx()
{
keccak256_4way_init( &allium_8way_ctx.keccak );
cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &allium_8way_ctx.skein );
init_groestl256( &allium_8way_ctx.groestl, 32 );
return true;
}
void allium_8way_hash( void *hash, const void *input )
{
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
// uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint64_t *hash0 = (uint64_t*)hash;
uint64_t *hash1 = (uint64_t*)hash+ 4;
uint64_t *hash2 = (uint64_t*)hash+ 8;
uint64_t *hash3 = (uint64_t*)hash+12;
uint64_t *hash4 = (uint64_t*)hash+16;
uint64_t *hash5 = (uint64_t*)hash+20;
uint64_t *hash6 = (uint64_t*)hash+24;
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhashA );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
keccak256_4way_close( &ctx.keccak, vhashA );
keccak256_4way_init( &ctx.keccak );
keccak256_4way_update( &ctx.keccak, vhashB, 32 );
keccak256_4way_close( &ctx.keccak, vhashB );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
@@ -115,136 +355,6 @@ void allium_8way_hash( void *state, const void *input )
LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
*/
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 256 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
}
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 8;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
allium_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (ALLIUM_4WAY)
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
hashState_groestl256 groestl;
} allium_4way_ctx_holder;
static __thread allium_4way_ctx_holder allium_4way_ctx;
bool init_allium_4way_ctx()
{
keccak256_4way_init( &allium_4way_ctx.keccak );
cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &allium_4way_ctx.skein );
init_groestl256( &allium_4way_ctx.groestl, 32 );
return true;
}
void allium_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash32 );
rintrlv_4x32_4x64( vhash64, vhash32, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
@@ -253,69 +363,97 @@ void allium_4way_hash( void *state, const void *input )
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
skein256_4way_update( &ctx.skein, vhashA, 32 );
skein256_4way_close( &ctx.skein, vhashA );
skein256_4way_init( &ctx.skein );
skein256_4way_update( &ctx.skein, vhashB, 32 );
skein256_4way_close( &ctx.skein, vhashB );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash4, hash4, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash5, hash5, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash6, hash6, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash7, hash7, 256 );
}
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m128i *noncev = (__m128i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &allium_4way_ctx.blake );
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
allium_8way_hash( hash, vdata );
allium_4way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
for ( int lane = 0; lane < 8; lane++ )
{
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
const uint64_t *lane_hash = hash + (lane<<2);
if unlikely( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
n += 8;
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
} while likely( (n <= last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
View File
@@ -78,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2rev3;
gate->hash = (void*)&lyra2rev3_hash;
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
opt_target_factor = 256.0;
return true;
@@ -119,7 +119,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
opt_target_factor = 256.0;
return true;
@@ -146,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};
@@ -165,7 +165,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT;
opt_target_factor = 256.0;
return true;
};
@@ -174,20 +174,20 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_8WAY)
#if defined (ALLIUM_16WAY)
gate->miner_thread_init = (void*)&init_allium_16way_ctx;
gate->scanhash = (void*)&scanhash_allium_16way;
gate->hash = (void*)&allium_16way_hash;
#elif defined (ALLIUM_8WAY)
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
gate->scanhash = (void*)&scanhash_allium_8way;
gate->hash = (void*)&allium_8way_hash;
#elif defined (ALLIUM_4WAY)
gate->miner_thread_init = (void*)&init_allium_4way_ctx;
gate->scanhash = (void*)&scanhash_allium_4way;
gate->hash = (void*)&allium_4way_hash;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
opt_target_factor = 256.0;
return true;
};
@@ -220,7 +220,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
// Assemble block header
algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
(uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) );
le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits), NULL );
for ( t = 0; t < 16; t++ )
g_work->data[ 20+t ] = ((uint32_t*)sctx->job.extra)[t];
}
@@ -229,7 +229,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_phi2_algo( algo_gate_t* gate )
{
// init_phi2_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->get_work_data_size = (void*)&phi2_get_work_data_size;
gate->decode_extra_data = (void*)&phi2_decode_extra_data;
gate->build_extraheader = (void*)&phi2_build_extraheader;
View File
@@ -153,27 +153,27 @@ bool lyra2h_thread_init();
//////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define ALLIUM_8WAY 1
#define ALLIUM_16WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY 1
#define ALLIUM_8WAY 1
#endif
bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_8WAY)
#if defined(ALLIUM_16WAY)
void allium_16way_hash( void *state, const void *input );
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_16way_ctx();
#elif defined(ALLIUM_8WAY)
void allium_8way_hash( void *state, const void *input );
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_8way_ctx();
#elif defined(ALLIUM_4WAY)
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_4way_ctx();
#else
void allium_hash( void *state, const void *input );
View File
@@ -575,4 +575,138 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
return 0;
}
int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[32];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
//====================================================================/
//=== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
if (wholeMatrix == NULL)
return -1;
memset_zero_512( (__m512i*)wholeMatrix, i>>5 );
uint64_t *ptrWord = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
uint64_t *ptr = wholeMatrix;
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
// now build the rest interleaving on the fly.
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
//Initializes M[0] and M[1]
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
nCols );
//updates the value of row* (deterministically picked during Setup))
rowa0 = (rowa0 + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window were visited.
if (rowa0 == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
//===================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++)
{
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
do
{
rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
reducedDuplexRow_2way_X( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
nCols );
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//----------------------------------------------------
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//----------------------------------------------------
} while (row != 0);
}
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
//Squeezes the key
squeeze_2way( state, K, (unsigned int) kLen );
//================== Freeing the memory =============================//
_mm_free(wholeMatrix);
return 0;
}
#endif
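// Illustrative aside (not part of the diff): the wandering phase above
// relies on the usual power-of-two trick. When nRows is a power of 2,
// masking with (nRows-1) matches the modulus even for a negative step,
// because the mask keeps the low bits of the two's-complement sum.
// Standalone sanity check, names are hypothetical:

#include <stdio.h>

int main(void)
{
   const unsigned nRows = 8;       // must be a power of 2
   long row = 0, step = -1;        // step may be negative in the loop

   // Matches: row = (row + step) & (unsigned int)(nRows-1);
   unsigned wrapped = (unsigned)(row + step) & (nRows - 1);

   printf( "%u\n", wrapped );      // prints 7, i.e. -1 mod 8
   return 0;
}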
View File
@@ -74,6 +74,9 @@ int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
#endif
#endif /* LYRA2_H_ */
View File
@@ -20,7 +20,7 @@ static __thread blake256_4way_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way( &l2h_4way_blake_mid, input, 64 );
blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
@@ -33,7 +33,7 @@ void lyra2h_4way_hash( void *state, const void *input )
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
View File
@@ -44,7 +44,7 @@ void lyra2rev2_8way_hash( void *state, const void *input )
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
blake256_8way( &ctx.blake, input + (64<<3), 16 );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
@@ -176,12 +176,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
rintrlv_4x32_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_update( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -201,7 +201,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_update( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -217,7 +217,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
@@ -242,7 +242,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
do
{
View File
@@ -209,7 +209,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
blake256_8way( &ctx.blake, input + (64*8), 16 );
blake256_8way_update( &ctx.blake, input + (64*8), 16 );
blake256_8way_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
@@ -252,7 +252,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way( &ctx.bmw, vhash, 32 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
}
@@ -277,7 +277,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 );
blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
do
{
@@ -334,8 +334,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
// blake256_4way( &ctx.blake, input, 80 );
blake256_4way( &ctx.blake, input + (64*4), 16 );
blake256_4way_update( &ctx.blake, input + (64*4), 16 );
blake256_4way_close( &ctx.blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -358,7 +357,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
@@ -383,7 +382,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 );
blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
do
{
View File
@@ -149,7 +149,7 @@ static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way( &l2z_8way_blake_mid, input, 64 );
blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
@@ -166,7 +166,7 @@ void lyra2z_8way_hash( void *state, const void *input )
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_update( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
@@ -247,7 +247,7 @@ static __thread blake256_4way_context l2z_4way_blake_mid;
void lyra2z_4way_midstate( const void* input )
{
blake256_4way_init( &l2z_4way_blake_mid );
blake256_4way( &l2z_4way_blake_mid, input, 64 );
blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
}
void lyra2z_4way_hash( void *state, const void *input )
@@ -260,7 +260,7 @@ void lyra2z_4way_hash( void *state, const void *input )
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
View File
@@ -196,7 +196,6 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
__m512i* in = (__m512i*)rowIn;
__m512i* inout = (__m512i*)rowInOut;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m512i t0, t1, t2;
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -218,24 +217,27 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
out[1] = _mm512_xor_si512( state1, in[1] );
out[2] = _mm512_xor_si512( state2, in[2] );
//M[row*][col] = M[row*][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
t2 = _mm512_permutex_epi64( state2, 0x93 );
{
register __m512i t0, t1, t2;
//M[row*][col] = M[row*][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
t2 = _mm512_permutex_epi64( state2, 0x93 );
inout[0] = _mm512_xor_si512( inout[0],
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
inout[0] = _mm512_xor_si512( inout[0],
_mm512_mask_blend_epi64( 0x11, t0, t2 ) );
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi64( 0x11, t1, t0 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
}
//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
//Output: goes to previous column
out -= BLOCK_LEN_M256I;
//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
//Output: goes to previous column
out -= BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );
@@ -244,12 +246,235 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
// Big ugly workaround for pointer aliasing: use a union of pointers.
// Access the matrix using m512i for in and out, m256i for inout.
// reducedDuplexRow has three versions depending on the rows inout.
// If both lanes reference the same row the fastest version can be used,
// equivalent to the linear version.
// If either rowinout overlaps with rowout the slowest version is used,
// to refresh local data after overwriting rowout.
// Otherwise the normal version is used, slower than unified, faster than
// overlap.
//
// The likelihood of each case depends on the number of rows. More rows
// means unified and overlap are both less likely.
// Unified has a 1 in Nrows chance.
// Overlap has a 2 in Nrows chance, slightly reduced because when both
// lanes overlap the case is unified instead.
// As a result normal occurs about Nrows-3 times in Nrows.
// for 4 rows: 1 unified, 2 overlap, 1 normal.
// for 8 rows: 1 unified, 2 overlap, 5 normal.
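// Illustrative aside (not part of the diff): the odds above are easy to
// sanity-check with a brute-force tally over all (rowa0, rowa1) pairs
// for a fixed output row. This standalone sketch reproduces the roughly
// 1 : 2 : Nrows-3 split quoted in the comment:

#include <stdio.h>

int main(void)
{
   const int nRows = 8;
   const int row = 0;   // rowOut index; any fixed value gives the same tally
   int unified = 0, overlap = 0, normal = 0;

   for ( int rowa0 = 0; rowa0 < nRows; rowa0++ )
   for ( int rowa1 = 0; rowa1 < nRows; rowa1++ )
   {
      if      ( rowa0 == rowa1 )                unified++; // same row twice
      else if ( rowa0 == row || rowa1 == row )  overlap++; // clashes with out
      else                                      normal++;
   }

   // For nRows = 8 this prints: unified 8  overlap 14  normal 42  of 64,
   // i.e. roughly 1 : 2 : 5 per 8 as noted above.
   printf( "unified %d  overlap %d  normal %d  of %d\n",
           unified, overlap, normal, nRows * nRows );
   return 0;
}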
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
static inline void reducedDuplexRow_2way_normal( uint64_t *State,
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols)
{
int i;
register __m512i state0, state1, state2, state3;
__m512i *in = (__m512i*)rowIn;
__m512i *inout0 = (__m512i*)rowInOut0;
__m512i *inout1 = (__m512i*)rowInOut1;
__m512i *out = (__m512i*)rowOut;
register __m512i io0, io1, io2;
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
state2 = _mm512_load_si512( (__m512i*)State + 2 );
state3 = _mm512_load_si512( (__m512i*)State + 3 );
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
io0 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
io1 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +1 ),
_mm512_load_si512( (__m512i*)inout1 +1 ) );
io2 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +2 ),
_mm512_load_si512( (__m512i*)inout1 +2 ) );
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2 ) );
//Applies the reduced-round transformation f to the sponge's state
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
{
register __m512i t0, t1, t2;
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
t2 = _mm512_permutex_epi64( state2, 0x93 );
io0 = _mm512_xor_si512( io0, _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
io1 = _mm512_xor_si512( io1, _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
io2 = _mm512_xor_si512( io2, _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
//M[rowOut][col] = M[rowOut][col] XOR rand
out[0] = _mm512_xor_si512( out[0], state0 );
out[1] = _mm512_xor_si512( out[1], state1 );
out[2] = _mm512_xor_si512( out[2], state2 );
}
_mm512_mask_store_epi64( inout0, 0x0f, io0 );
_mm512_mask_store_epi64( inout1, 0xf0, io0 );
_mm512_mask_store_epi64( inout0 +1, 0x0f, io1 );
_mm512_mask_store_epi64( inout1 +1, 0xf0, io1 );
_mm512_mask_store_epi64( inout0 +2, 0x0f, io2 );
_mm512_mask_store_epi64( inout1 +2, 0xf0, io2 );
//Goes to next block
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I;
inout1 += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );
_mm512_store_si512( (__m512i*)State + 1, state1 );
_mm512_store_si512( (__m512i*)State + 2, state2 );
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols)
{
int i;
register __m512i state0, state1, state2, state3;
__m512i *in = (__m512i*)rowIn;
__m512i *inout0 = (__m512i*)rowInOut0;
__m512i *inout1 = (__m512i*)rowInOut1;
__m512i *out = (__m512i*)rowOut;
// inout_ovly io;
ovly_512 io0, io1, io2;
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
state2 = _mm512_load_si512( (__m512i*)State + 2 );
state3 = _mm512_load_si512( (__m512i*)State + 3 );
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
io0.v512 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
io1.v512 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +1 ),
_mm512_load_si512( (__m512i*)inout1 +1 ) );
io2.v512 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +2 ),
_mm512_load_si512( (__m512i*)inout1 +2 ) );
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
/*
io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
io.v512[1] = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +1 ),
_mm512_load_si512( (__m512i*)inout1 +1 ) );
io.v512[2] = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +2 ),
_mm512_load_si512( (__m512i*)inout1 +2 ) );
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
*/
//Applies the reduced-round transformation f to the sponge's state
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
{
__m512i t0, t1, t2;
//M[rowOut][col] = M[rowOut][col] XOR rand
out[0] = _mm512_xor_si512( out[0], state0 );
out[1] = _mm512_xor_si512( out[1], state1 );
out[2] = _mm512_xor_si512( out[2], state2 );
// if out is the same row as inout, update with new data.
if ( rowOut == rowInOut0 )
{
io0.v512 = _mm512_mask_blend_epi64( 0x0f, io0.v512, out[0] );
io1.v512 = _mm512_mask_blend_epi64( 0x0f, io1.v512, out[1] );
io2.v512 = _mm512_mask_blend_epi64( 0x0f, io2.v512, out[2] );
}
if ( rowOut == rowInOut1 )
{
io0.v512 = _mm512_mask_blend_epi64( 0xf0, io0.v512, out[0] );
io1.v512 = _mm512_mask_blend_epi64( 0xf0, io1.v512, out[1] );
io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
}
/*
if ( rowOut == rowInOut0 )
{
io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] );
io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] );
}
if ( rowOut == rowInOut1 )
{
io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] );
io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
}
*/
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
t2 = _mm512_permutex_epi64( state2, 0x93 );
io0.v512 = _mm512_xor_si512( io0.v512,
_mm512_mask_blend_epi64( 0x11, t0, t2 ) );
io1.v512 = _mm512_xor_si512( io1.v512,
_mm512_mask_blend_epi64( 0x11, t1, t0 ) );
io2.v512 = _mm512_xor_si512( io2.v512,
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
}
casti_m256i( inout0, 0 ) = io0.v256lo;
casti_m256i( inout1, 1 ) = io0.v256hi;
casti_m256i( inout0, 2 ) = io1.v256lo;
casti_m256i( inout1, 3 ) = io1.v256hi;
casti_m256i( inout0, 4 ) = io2.v256lo;
casti_m256i( inout1, 5 ) = io2.v256hi;
/*
_mm512_mask_store_epi64( inout0, 0x0f, io.v512[0] );
_mm512_mask_store_epi64( inout1, 0xf0, io.v512[0] );
_mm512_mask_store_epi64( inout0 +1, 0x0f, io.v512[1] );
_mm512_mask_store_epi64( inout1 +1, 0xf0, io.v512[1] );
_mm512_mask_store_epi64( inout0 +2, 0x0f, io.v512[2] );
_mm512_mask_store_epi64( inout1 +2, 0xf0, io.v512[2] );
*/
//Goes to next block
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I;
inout1 += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );
_mm512_store_si512( (__m512i*)State + 1, state1 );
_mm512_store_si512( (__m512i*)State + 2, state2 );
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
static inline void reducedDuplexRow_2way_overlap_X( uint64_t *State,
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols)
{
int i;
register __m512i state0, state1, state2, state3;
@@ -257,30 +482,14 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
__m256i *inout0 = (__m256i*)rowInOut0;
__m256i *inout1 = (__m256i*)rowInOut1;
__m512i *out = (__m512i*)rowOut;
__m512i io[3];
povly inout;
inout.v512 = &io[0];
__m512i t0, t1, t2;
inout_ovly inout;
__m512i t0, t1, t2;
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
state2 = _mm512_load_si512( (__m512i*)State + 2 );
state3 = _mm512_load_si512( (__m512i*)State + 3 );
_mm_prefetch( in, _MM_HINT_T0 );
_mm_prefetch( inout0, _MM_HINT_T0 );
_mm_prefetch( inout1, _MM_HINT_T0 );
_mm_prefetch( in + 2, _MM_HINT_T0 );
_mm_prefetch( inout0 + 2, _MM_HINT_T0 );
_mm_prefetch( inout1 + 2, _MM_HINT_T0 );
_mm_prefetch( in + 4, _MM_HINT_T0 );
_mm_prefetch( inout0 + 4, _MM_HINT_T0 );
_mm_prefetch( inout1 + 4, _MM_HINT_T0 );
_mm_prefetch( in + 6, _MM_HINT_T0 );
_mm_prefetch( inout0 + 6, _MM_HINT_T0 );
_mm_prefetch( inout1 + 6, _MM_HINT_T0 );
for ( i = 0; i < nCols; i++ )
{
@@ -311,15 +520,15 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
// if inout is the same row as out it was just overwritten, reload.
if ( rowOut == rowInOut0 )
{
inout.v256[0] = inout0[0];
inout.v256[2] = inout0[2];
inout.v256[4] = inout0[4];
inout.v256[0] = ( (__m256i*)out )[0];
inout.v256[2] = ( (__m256i*)out )[2];
inout.v256[4] = ( (__m256i*)out )[4];
}
if ( rowOut == rowInOut1 )
{
inout.v256[1] = inout1[1];
inout.v256[3] = inout1[3];
inout.v256[5] = inout1[5];
inout.v256[1] = ( (__m256i*)out )[1];
inout.v256[3] = ( (__m256i*)out )[3];
inout.v256[5] = ( (__m256i*)out )[5];
}
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
@@ -328,12 +537,12 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
t2 = _mm512_permutex_epi64( state2, 0x93 );
inout.v512[0] = _mm512_xor_si512( inout.v512[0],
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
_mm512_mask_blend_epi64( 0x11, t0, t2 ) );
inout.v512[1] = _mm512_xor_si512( inout.v512[1],
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
_mm512_mask_blend_epi64( 0x11, t1, t0 ) );
inout.v512[2] = _mm512_xor_si512( inout.v512[2],
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
inout0[0] = inout.v256[0];
inout1[1] = inout.v256[1];
inout0[2] = inout.v256[2];
@@ -354,4 +563,108 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
// rowInOut0 == rowInOut1, fastest, least likely: 1 / nrows
static inline void reducedDuplexRow_2way_unified( uint64_t *State,
uint64_t *rowIn, uint64_t *rowInOut0,
uint64_t *rowOut, uint64_t nCols)
{
int i;
register __m512i state0, state1, state2, state3;
__m512i *in = (__m512i*)rowIn;
__m512i *inout = (__m512i*)rowInOut0;
__m512i *out = (__m512i*)rowOut;
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
state2 = _mm512_load_si512( (__m512i*)State + 2 );
state3 = _mm512_load_si512( (__m512i*)State + 3 );
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], inout[0] ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], inout[1] ) );
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], inout[2] ) );
//Applies the reduced-round transformation f to the sponge's state
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
{
register __m512i t0, t1, t2;
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
t2 = _mm512_permutex_epi64( state2, 0x93 );
inout[0] = _mm512_xor_si512( inout[0],
_mm512_mask_blend_epi64( 0x11, t0, t2 ) );
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi64( 0x11, t1, t0 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
out[0] = _mm512_xor_si512( out[0], state0 );
out[1] = _mm512_xor_si512( out[1], state1 );
out[2] = _mm512_xor_si512( out[2], state2 );
}
//Goes to next block
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );
_mm512_store_si512( (__m512i*)State + 1, state1 );
_mm512_store_si512( (__m512i*)State + 2, state2 );
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
// Multi-level specialization.
// There are three cases that need to be handled:
// unified: inout data is contiguous; fastest, unlikely.
// normal: inout data is not contiguous with no overlap with out; likely.
// overlap: inout data is not contiguous and one lane overlaps with out;
// slowest, unlikely.
//
// In addition, different algos prefer different coding: x25x and x22i
// prefer 256 bit memory accesses to handle the diverged data, while all
// other algos prefer 512 bit memory accesses with masking and blending.
// Wrapper
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols )
{
if ( rowInOut0 == rowInOut1 )
reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) )
reducedDuplexRow_2way_overlap( State, rowIn, rowInOut0, rowInOut1,
rowOut, nCols );
else
reducedDuplexRow_2way_normal( State, rowIn, rowInOut0, rowInOut1,
rowOut, nCols );
}
inline void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols )
{
if ( rowInOut0 == rowInOut1 )
reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) )
{
asm volatile ( "nop" ); // Prevent GCC from optimizing
reducedDuplexRow_2way_overlap_X( State, rowIn, rowInOut0, rowInOut1,
rowOut, nCols );
}
else
reducedDuplexRow_2way_normal( State, rowIn, rowInOut0, rowInOut1,
rowOut, nCols );
}
#endif // AVX512
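// Illustrative aside (not part of the diff): the two store styles the
// comment above contrasts reduce to the following. A condensed sketch
// assuming AVX512F and 64-byte aligned inout rows; the function names
// are hypothetical.

#include <immintrin.h>
#include <stdint.h>

// Masked 512-bit stores: qwords 0-3 of io go to inout0, qwords 4-7 to
// inout1, element positions preserved. Preferred by most algos.
static void store_lanes_masked( uint64_t *inout0, uint64_t *inout1,
                                __m512i io )
{
   _mm512_mask_store_epi64( inout0, 0x0f, io );
   _mm512_mask_store_epi64( inout1, 0xf0, io );
}

// Equivalent plain 256-bit stores of each half, the style preferred by
// x22i and x25x.
static void store_lanes_256( uint64_t *inout0, uint64_t *inout1,
                             __m512i io )
{
   _mm256_storeu_si256( (__m256i*)inout0,
                        _mm512_castsi512_si256( io ) );
   _mm256_storeu_si256( (__m256i*)(inout1 + 4),
                        _mm512_extracti64x4_epi64( io, 1 ) );
}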
View File
@@ -203,13 +203,24 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
union _povly
union _ovly_512
{
__m512i *v512;
__m256i *v256;
uint64_t *u64;
__m512i v512;
struct
{
__m256i v256lo;
__m256i v256hi;
};
};
typedef union _povly povly;
typedef union _ovly_512 ovly_512;
union _inout_ovly
{
__m512i v512[3];
__m256i v256[6];
};
typedef union _inout_ovly inout_ovly;
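// Illustrative aside (not part of the diff): minimal standalone usage of
// the value overlay, assuming AVX512F and C11 anonymous structs. One
// 512-bit computation, then the two 256-bit halves are scattered to
// different rows with plain stores, no masking needed.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

union ovly_512_demo              // same layout as _ovly_512 above
{
   __m512i v512;
   struct { __m256i v256lo; __m256i v256hi; };
};

int main(void)
{
   uint64_t row0[4], row1[4];
   union ovly_512_demo io;

   // Lanes 0..3 form the low half, lanes 4..7 the high half.
   io.v512 = _mm512_set_epi64( 7, 6, 5, 4, 3, 2, 1, 0 );

   _mm256_storeu_si256( (__m256i*)row0, io.v256lo );
   _mm256_storeu_si256( (__m256i*)row1, io.v256hi );

   printf( "row0[0]=%llu row1[0]=%llu\n", (unsigned long long)row0[0],
           (unsigned long long)row1[0] );   // row0[0]=0 row1[0]=4
   return 0;
}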
//---- Housekeeping
void initState_2way( uint64_t State[/*16*/] );
@@ -234,6 +245,10 @@ void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols);
#endif
View File
@@ -149,7 +149,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
char data_str[161], hash_str[65], target_str[65];
//uint8_t *bdata = 0;
uint8_t bdata[8192] __attribute__ ((aligned (64)));
int rc = 0, i, digits;
int i, digits;
int bytes;
size_t p = sizeof(unsigned long), a = 64/p, b = 32/p;
@@ -267,47 +267,41 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
}
const unsigned char *hash_ = (const unsigned char *)hash;
const unsigned char *target_ = (const unsigned char *)ptarget;
for ( i = 31; i >= 0; i-- )
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !opt_benchmark ) )
// if ( unlikely( hash[7] <= ptarget[7] ) )
// if ( likely( fulltest( hash, ptarget ) && !opt_benchmark ) )
{
if ( hash_[i] != target_[i] )
if ( opt_debug )
{
rc = hash_[i] < target_[i];
break;
}
}
if ( unlikely(rc) )
{
if ( opt_debug )
{
bin2hex(hash_str, (unsigned char *)hash, 32);
bin2hex(target_str, (unsigned char *)ptarget, 32);
bin2hex(data_str, (unsigned char *)data, 80);
applog(LOG_DEBUG, "DEBUG: [%d thread] Found share!\ndata %s\nhash %s\ntarget %s", thr_id,
data_str,
hash_str,
target_str);
bin2hex( hash_str, (unsigned char *)hash, 32 );
bin2hex( target_str, (unsigned char *)ptarget, 32 );
bin2hex( data_str, (unsigned char *)data, 80 );
applog( LOG_DEBUG, "DEBUG: [%d thread] Found share!\ndata %s\nhash %s\ntarget %s",
thr_id, data_str, hash_str, target_str );
}
pdata[19] = data[19];
submit_solution( work, hash, mythr );
}
} while (n < max_nonce && !work_restart[thr_id].restart);
} while ( n < max_nonce && !work_restart[thr_id].restart );
pdata[19] = n;
mpf_set_prec_raw(magifpi, prec0);
mpf_set_prec_raw(magifpi0, prec0);
mpf_set_prec_raw(mptmp, prec0);
mpf_set_prec_raw(mpt1, prec0);
mpf_set_prec_raw(mpt2, prec0);
mpf_clear(magifpi);
mpf_clear(magifpi0);
mpf_clear(mpten);
mpf_clear(mptmp);
mpf_clear(mpt1);
mpf_clear(mpt2);
mpz_clears(magipi, magisw, product, bns0, bns1, NULL);
mpf_set_prec_raw( magifpi, prec0 );
mpf_set_prec_raw( magifpi0, prec0 );
mpf_set_prec_raw( mptmp, prec0 );
mpf_set_prec_raw( mpt1, prec0 );
mpf_set_prec_raw( mpt2, prec0 );
mpf_clear( magifpi );
mpf_clear( magifpi0 );
mpf_clear( mpten );
mpf_clear( mptmp );
mpf_clear( mpt1 );
mpf_clear( mpt2 );
mpz_clears( magipi, magisw, product, bns0, bns1, NULL );
*hashes_done = n - first_nonce + 1;
return 0;
View File
@@ -102,7 +102,7 @@ int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
nist5hash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
@@ -133,7 +133,7 @@ void nist5hash_4way( void *out, const void *input )
keccak512_4way_context ctx_keccak;
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, input, 80 );
blake512_4way_update( &ctx_blake, input, 80 );
blake512_4way_close( &ctx_blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -154,15 +154,15 @@ void nist5hash_4way( void *out, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_update( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhash );
keccak512_4way_init( &ctx_keccak );
keccak512_4way( &ctx_keccak, vhash, 64 );
keccak512_4way_update( &ctx_keccak, vhash, 64 );
keccak512_4way_close( &ctx_keccak, vhash );
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_update( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, out );
}
@@ -190,7 +190,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
nist5hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )