Compare commits


1 Commit

Author: Jay D Dee
SHA1: f990f6a702
Message: v3.11.3
Date: 2020-01-10 20:37:47 -05:00
120 changed files with 13025 additions and 6026 deletions

View File

@@ -33,6 +33,3 @@ Jay D Dee
xcouiz@gmail.com
Cryply
Colin Percival
Alexander Peslyak

View File

@@ -80,6 +80,7 @@ cpuminer_SOURCES = \
algo/cryptonight/cryptonight-common.c\
algo/cryptonight/cryptonight-aesni.c\
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/echo/sph_echo.c \
@@ -120,8 +121,6 @@ cpuminer_SOURCES = \
algo/keccak/keccak-hash-4way.c \
algo/keccak/keccak-4way.c\
algo/keccak/keccak-gate.c \
algo/keccak/sha3d-4way.c \
algo/keccak/sha3d.c \
algo/lanehash/lane.c \
algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \
@@ -181,7 +180,6 @@ cpuminer_SOURCES = \
algo/sha/sph_sha2big.c \
algo/sha/sha256-hash-4way.c \
algo/sha/sha512-hash-4way.c \
algo/sha/hmac-sha256-hash.c \
algo/sha/sha2.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
@@ -294,11 +292,12 @@ cpuminer_SOURCES = \
algo/x22/x25x.c \
algo/x22/x25x-4way.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/yescrypt-r8g.c \
algo/yespower/sha256_p.c \
algo/yespower/yespower-opt.c
disable_flags =

View File

@@ -97,10 +97,10 @@ Supported Algorithms
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptjane:nf
sha256d Double SHA-256
sha256q Quad SHA-256, Pyrite (PYE)
sha256t Triple SHA-256, Onecoin (OC)
sha3d Double keccak256 (BSHA3)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
@@ -134,7 +134,6 @@ Supported Algorithms
xevan Bitsend (BSD)
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr8g Koto (KOTO)
yescryptr16 Eli
yescryptr32 WAVI
yespower Cryply
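
As the list above notes, sha3d is simply keccak-256 applied twice (double keccak256, used by BSHA3). A minimal reference sketch using the sph_keccak API bundled with this source tree; the header path and exact function names are assumed from sphlib conventions, not verified against this revision:

    #include <stddef.h>
    #include "sph_keccak.h"   // path within algo/keccak/ assumed

    // sha3d(x) = keccak256( keccak256(x) )
    static void sha3d_ref( void *out32, const void *in, size_t len )
    {
        unsigned char t[32];
        sph_keccak256_context ctx;
        sph_keccak256_init( &ctx );
        sph_keccak256( &ctx, in, len );
        sph_keccak256_close( &ctx, t );      // first pass
        sph_keccak256_init( &ctx );
        sph_keccak256( &ctx, t, 32 );
        sph_keccak256_close( &ctx, out32 );  // second pass over the 32-byte digest
    }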

View File

@@ -33,92 +33,9 @@ supported.
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.
Reporting bugs
--------------
Bugs can be reported by sending an email to JayDDee246@gmail.com or opening
an issue in git: https://github.com/JayDDee/cpuminer-opt/issues
Please include the following information:
1. CPU model, operating system, cpuminer-opt version (must be latest),
binary file for Windows, changes to default build procedure for Linux.
2. Exact command line (except user and pw) and initial output showing
the above requested info.
3. Additional program output showing any error messages or other
pertinent data.
4. A clear description of the problem including history, scope,
persistence or intermittence, and reproducibility.
In simpler terms:
What is it doing?
What should it be doing instead?
Did it work in a previous release?
Does it happen for all algos? All pools? All options? Solo?
Does it happen all the time?
If not what makes it happen or not happen?
Change Log
----------
v3.11.8
Fixed network hashrate showing incorrect data; it should be close now.
Fixed compile errors when using GCC 10 with default flag -fno-common.
Faster x16r, x16rv2, x16rt, x16s, x21s, veil, hex with midstate prehash.
Decoupled sapling usage from block version 5 in yescryptr8g.
More detailed data reporting for low difficulty rejected shares.
v3.11.7
Added yescryptr8g algo for KOTO, including support for block version 5.
Added sha3d algo for BSHA3.
Removed memcmp and clean_job checks from get_new_work; it now only checks job_id.
Small improvement to sha512 and sha256 parallel implementations that don't
use SHA.
v3.11.6
Fixed CPU temperature regression from v3.11.5.
More improvements to share log. More compact, highlight incremented counter,
block height when solved, job id when stale.
v3.11.5
Fixed AVX512 detection that could cause compilation errors on CPUs
without AVX512.
Fixed "BLOCK SOLVED" log incorrectly displaying "Accepted" when a block
is solved.
Added share counter to share submitted & accepted logs.
Added job id to share submitted log.
Share submitted log is no longer highlighted blue; there was too much blue.
Another CPU temperature fix for Linux.
Added bug reporting tips to RELEASE NOTES.
v3.11.4
Fixed scrypt segfault since v3.9.9.1.
Stale shares counted and reported separately from other rejected shares.
Display of counters for solved blocks, rejects, stale shares suppressed in
periodic summary when zero.
v3.11.3
Fixed x12 AVX2 again.
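
The v3.11.7 entry above about get_new_work describes new-work detection: rather than a memcmp over the whole work data plus a clean_job flag, the miner treats a changed stratum job id alone as the restart trigger. A minimal sketch of that check; the job_id name follows struct work as used elsewhere in this codebase, but this is an illustration, not the actual get_new_work body:

    #include <stdbool.h>
    #include <string.h>

    // New work iff the stratum job id differs from the one being mined.
    static bool job_changed( const char *cur_job_id, const char *new_job_id )
    {
       if ( !cur_job_id || !new_job_id ) return true;
       return strcmp( cur_job_id, new_job_id ) != 0;
    }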

View File

@@ -209,7 +209,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: register_sha3d_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
@@ -248,7 +247,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
*/
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
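
register_algo_gate above is a plain dispatch table: one switch maps the algo id to a registration function that fills a struct of function pointers, and the miner threads thereafter call only through the gate. A stripped-down sketch of the pattern; the field names and signatures here are illustrative, not the real algo_gate_t layout:

    typedef struct
    {
       int  ( *scanhash )( void *work, unsigned int max_nonce );
       void ( *hash )    ( void *output, const void *input );
    } example_gate_t;

    static int  my_scanhash( void *work, unsigned int max_nonce ) { return 0; }
    static void my_hash( void *output, const void *input ) { }

    // Called once from the dispatcher's switch for the new algo's case.
    static int register_myalgo_algo( example_gate_t *gate )
    {
       gate->scanhash = my_scanhash;
       gate->hash     = my_hash;
       return 1;
    }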

View File

@@ -121,55 +121,54 @@ void ( *hash_suw ) ( void*, const void* );
// Allocate thread local buffers and other initialization specific to miner
// threads.
bool ( *miner_thread_init ) ( int );
bool ( *miner_thread_init ) ( int );
// Generate global blockheader from stratum data.
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
// Get thread local copy of blockheader with unique nonce.
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t* );
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*,
bool );
// Return pointer to nonce in blockheader.
uint32_t *( *get_nonceptr ) ( uint32_t* );
uint32_t *( *get_nonceptr ) ( uint32_t* );
// Decode getwork blockheader
bool ( *work_decode ) ( const json_t*, struct work* );
bool ( *work_decode ) ( const json_t*, struct work* );
// Extra getwork data
void ( *decode_extra_data ) ( struct work*, uint64_t* );
void ( *decode_extra_data ) ( struct work*, uint64_t* );
bool ( *submit_getwork_result ) ( CURL*, struct work* );
bool ( *submit_getwork_result ) ( CURL*, struct work* );
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
// Increment extranonce
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t,
unsigned char* );
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t );
// Build mining.submit message
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
char* ( *malloc_txs_request ) ( struct work* );
char* ( *malloc_txs_request ) ( struct work* );
// Big or little
void ( *set_work_data_endian ) ( struct work* );
void ( *set_work_data_endian ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
// Wait for first work
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
// Diverge mining threads
bool ( *do_this_thread ) ( int );
bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( struct work* );
void ( *resync_threads ) ( struct work* );
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
bool ( *stratum_handle_response ) ( json_t* );
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
bool ( *stratum_handle_response )( json_t* );
set_t optimizations;
int ( *get_work_data_size ) ();
int ntime_index;
@@ -226,7 +225,7 @@ uint32_t *std_get_nonceptr( uint32_t *work_data );
uint32_t *jr2_get_nonceptr( uint32_t *work_data );
void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
uint32_t* end_nonce_ptr );
uint32_t* end_nonce_ptr, bool clean_job );
void jr2_get_new_work( struct work *work, struct work *g_work, int thr_id,
uint32_t* end_nonce_ptr );
@@ -257,8 +256,7 @@ double std_calc_network_diff( struct work *work );
void std_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits,
unsigned char *final_sapling_hash );
uint32_t ntime, uint32_t nbits );
void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );

View File

@@ -13,7 +13,7 @@ void blakehash_4way(void *state, const void *input)
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way_update( &ctx, input + (64<<2), 16 );
blake256r14_4way( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -36,7 +36,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
blake256r14_4way( &blake_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );

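The hunks above show the midstate-prehash pattern mentioned in the change log: the first 64-byte block of the 80-byte header is the same for every nonce, so it is compressed once per job outside the scan loop, and each nonce copies that context and hashes only the final 16 bytes. A toy sketch of the idea with a stand-in hash context (the real code uses the blake256r14_4way functions above):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint64_t state; size_t len; } ctx_t;   // stand-in context
    static void h_init( ctx_t *c ) { c->state = 0; c->len = 0; }
    static void h_update( ctx_t *c, const void *d, size_t n )
    {  // toy mixing; the real code runs the Blake compression here
       const uint8_t *p = d;
       for ( size_t i = 0; i < n; i++ ) c->state = c->state * 31 + p[i];
       c->len += n;
    }

    void scan( const uint8_t header[80], uint32_t n_start, uint32_t n_end )
    {
       ctx_t midstate;
       h_init( &midstate );
       h_update( &midstate, header, 64 );      // first block: once per job
       for ( uint32_t n = n_start; n < n_end; n++ )
       {
          ctx_t c = midstate;                  // cheap copy instead of re-hash
          uint8_t tail[16];
          memcpy( tail, header + 64, 16 );
          memcpy( tail + 12, &n, 4 );          // nonce is the last 4 header bytes
          h_update( &c, tail, 16 );
          // ... close and compare the digest against the target ...
       }
    }
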
View File

@@ -37,6 +37,8 @@
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY__ 1
//#ifdef __SSE4_2__
#ifdef __cplusplus
extern "C"{
#endif
@@ -49,41 +51,46 @@ extern "C"{
#define SPH_SIZE_blake512 512
//////////////////////////
//
// Blake-256 4 way SSE2
// With SSE4.2 only Blake-256 4 way is available.
// With AVX2 Blake-256 8way & Blake-512 4 way are also available.
// Blake-256 4 way
typedef struct {
unsigned char buf[64<<2];
uint32_t H[8<<2];
// __m128i buf[16] __attribute__ ((aligned (64)));
// __m128i H[8];
// __m128i S[4];
size_t ptr;
uint32_t T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context __attribute__ ((aligned (64)));
// Default, 14 rounds, blake, decred
// Default 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *ctx);
void blake256_4way_update(void *ctx, const void *data, size_t len);
#define blake256_4way blake256_4way_update
void blake256_4way_close(void *ctx, void *dst);
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way_update(void *cc, const void *data, size_t len);
#define blake256r14_4way blake256r14_4way_update
void blake256r14_4way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way_update(void *cc, const void *data, size_t len);
#define blake256r8_4way blake256r8_4way_update
void blake256r8_4way_close(void *cc, void *dst);
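
For orientation, the 4-way contexts declared above keep four independent 32-bit lanes interleaved in one buffer and follow the usual init/update/close shape. A hedged usage sketch; it assumes the input has already been interleaved 4x32 and that the header name matches this source tree:

    #include "blake-hash-4way.h"   // header name assumed

    // vin:  four 80-byte block headers, interleaved 4x32.
    // vout: four 256-bit digests, interleaved 4x32.
    void blake256r14_hash4( void *vout, const void *vin )
    {
       blake256r14_4way_context ctx;
       blake256r14_4way_init( &ctx );
       blake256r14_4way_update( &ctx, vin, 80 );  // len is per-lane bytes
       blake256r14_4way_close( &ctx, vout );
    }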
#ifdef __AVX2__
//////////////////////////
//
// Blake-256 8 way AVX2
// Blake-256 8 way
typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
@@ -97,6 +104,7 @@ typedef struct {
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way_update(void *cc, const void *data, size_t len);
//#define blake256_8way blake256_8way_update
void blake256_8way_close(void *cc, void *dst);
// 14 rounds, blake, decred
@@ -109,9 +117,10 @@ void blake256r14_8way_close(void *cc, void *dst);
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
#define blake256r8_8way blake256r8_8way_update
void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way AVX2
// Blake-512 4 way
typedef struct {
__m256i buf[16];
@@ -125,15 +134,14 @@ typedef blake_4way_big_context blake512_4way_context;
void blake512_4way_init( blake_4way_big_context *sc );
void blake512_4way_update( void *cc, const void *data, size_t len );
#define blake512_4way blake512_4way_update
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
////////////////////////////
//
// Blake-256 16 way AVX512
//Blake-256 16 way
typedef struct {
__m512i buf[16];
@@ -161,9 +169,8 @@ void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);
////////////////////////////
//
//// Blake-512 8 way AVX512
// Blake-512 8 way
typedef struct {
__m512i buf[16];
@@ -178,10 +185,12 @@ typedef blake_8way_big_context blake512_8way_context;
void blake512_8way_init( blake_8way_big_context *sc );
void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len );
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#endif // AVX512
#endif // AVX2
#ifdef __cplusplus

View File

@@ -267,22 +267,22 @@ static const sph_u64 CB[16] = {
#define CBx_(n) CBx__(n)
#define CBx__(n) CB ## n
#define CB0 0x243F6A8885A308D3
#define CB1 0x13198A2E03707344
#define CB2 0xA4093822299F31D0
#define CB3 0x082EFA98EC4E6C89
#define CB4 0x452821E638D01377
#define CB5 0xBE5466CF34E90C6C
#define CB6 0xC0AC29B7C97C50DD
#define CB7 0x3F84D5B5B5470917
#define CB8 0x9216D5D98979FB1B
#define CB9 0xD1310BA698DFB5AC
#define CBA 0x2FFD72DBD01ADFB7
#define CBB 0xB8E1AFED6A267E96
#define CBC 0xBA7C9045F12C7F99
#define CBD 0x24A19947B3916CF7
#define CBE 0x0801F2E2858EFC16
#define CBF 0x636920D871574E69
#define CB0 SPH_C64(0x243F6A8885A308D3)
#define CB1 SPH_C64(0x13198A2E03707344)
#define CB2 SPH_C64(0xA4093822299F31D0)
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
#define CB4 SPH_C64(0x452821E638D01377)
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
#define CB7 SPH_C64(0x3F84D5B5B5470917)
#define CB8 SPH_C64(0x9216D5D98979FB1B)
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
#define CBB SPH_C64(0xB8E1AFED6A267E96)
#define CBC SPH_C64(0xBA7C9045F12C7F99)
#define CBD SPH_C64(0x24A19947B3916CF7)
#define CBE SPH_C64(0x0801F2E2858EFC16)
#define CBF SPH_C64(0x636920D871574E69)
#define READ_STATE64(state) do { \
H0 = (state)->H[0]; \
@@ -349,9 +349,9 @@ static const sph_u64 CB[16] = {
#define DECL_STATE64_8WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
__m512i S0, S1, S2, S3; \
uint64_t T0, T1;
sph_u64 T0, T1;
#define COMPRESS64_8WAY( buf ) do \
#define COMPRESS64_8WAY do \
{ \
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -424,84 +424,6 @@ static const sph_u64 CB[16] = {
H7 = mm512_xor4( VF, V7, S3, H7 ); \
} while (0)
void blake512_8way_compress( blake_8way_big_context *sc )
{
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
__m512i shuf_bswap64;
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );
V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB4 ) );
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
m512_const1_64( CB5 ) );
VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB6 ) );
VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
m512_const1_64( CB7 ) );
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
ROUND_B_8WAY(6);
ROUND_B_8WAY(7);
ROUND_B_8WAY(8);
ROUND_B_8WAY(9);
ROUND_B_8WAY(0);
ROUND_B_8WAY(1);
ROUND_B_8WAY(2);
ROUND_B_8WAY(3);
ROUND_B_8WAY(4);
ROUND_B_8WAY(5);
sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
}
void blake512_8way_init( blake_8way_big_context *sc )
{
__m512i zero = m512_zero;
@@ -533,43 +455,39 @@ blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
const int buf_size = 128; // sizeof/8
// 64, 80 bytes: 1st pass copy data. 2nd pass copy padding and compress.
// 128 bytes: 1st pass copy data, compress. 2nd pass copy padding, compress.
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE64(sc);
while ( len > 0 )
{
size_t clen;
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = T0 + 1024 ) < 1024 )
T1 = T1 + 1;
COMPRESS64_8WAY( buf );
ptr = 0;
}
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )
T1 = SPH_T64(T1 + 1);
COMPRESS64_8WAY;
ptr = 0;
}
}
WRITE_STATE64(sc);
sc->ptr = ptr;
}
}
static void
blake64_8way_close( blake_8way_big_context *sc, void *dst )
@@ -577,22 +495,26 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
__m512i buf[16];
size_t ptr;
unsigned bit_len;
uint64_t th, tl;
// uint64_t z, zz;
sph_u64 th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
// z = 0x80 >> n;
// zz = ((ub & -z) | z) & 0xFF;
// buf[ptr>>3] = _mm512_set1_epi64( zz );
buf[ptr>>3] = m512_const1_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
{
@@ -613,8 +535,8 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_512( buf, 112>>3 );
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
buf[112>>3] = m512_const1_64( bswap_64( th ) );
@@ -625,79 +547,6 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
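
T0/T1 above form Blake-512's 128-bit message bit counter, and the SPH_C64(0xFFFFFFFFFFFFFC00) stores at close are the usual sphlib trick: that value is -1024 mod 2^64, pre-subtracting the 1024 bits the final compression of a padding-only block would otherwise add. The carry step itself, isolated from the update loop:

    #include <stdint.h>

    // Each full 128-byte block adds 1024 bits; wraparound of the low
    // word carries into the high word.
    static void count_block( uint64_t *T0, uint64_t *T1 )
    {
       if ( ( *T0 = *T0 + 1024 ) < 1024 )   // low word wrapped past 2^64
          *T1 = *T1 + 1;
    }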
// init, update & close
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
const void *data, size_t len )
{
// init
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
casti_m512i( sc->S, 0 ) = m512_zero;
casti_m512i( sc->S, 1 ) = m512_zero;
casti_m512i( sc->S, 2 ) = m512_zero;
casti_m512i( sc->S, 3 ) = m512_zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
// update
memcpy_512( sc->buf, (__m512i*)data, len>>3 );
sc->ptr = len;
if ( len == 128 )
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
sc->ptr = 0;
}
// close
size_t ptr64 = sc->ptr >> 3;
unsigned bit_len;
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m512_const1_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr64 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 1024 - bit_len;
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m512_const1_64( 0x0100000000000000ULL );
sc->buf[14] = m512_const1_64( bswap_64( th ) );
sc->buf[15] = m512_const1_64( bswap_64( tl ) );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_8way_compress( sc );
mm512_block_bswap_64( (__m512i*)dst, sc->H );
}
void
blake512_8way_update(void *cc, const void *data, size_t len)
{
@@ -706,6 +555,12 @@ blake512_8way_update(void *cc, const void *data, size_t len)
void
blake512_8way_close(void *cc, void *dst)
{
blake512_8way_addbits_and_close(cc, 0, 0, dst);
}
void
blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
blake64_8way_close(cc, dst);
}
@@ -741,7 +596,7 @@ blake512_8way_close(void *cc, void *dst)
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
uint64_t T0, T1;
sph_u64 T0, T1;
#define COMPRESS64_4WAY do \
{ \
@@ -815,81 +670,6 @@ blake512_8way_close(void *cc, void *dst)
} while (0)
void blake512_4way_compress( blake_4way_big_context *sc )
{
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
__m256i shuf_bswap64;
V0 = sc->H[0];
V1 = sc->H[1];
V2 = sc->H[2];
V3 = sc->H[3];
V4 = sc->H[4];
V5 = sc->H[5];
V6 = sc->H[6];
V7 = sc->H[7];
V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );
V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
m256_const1_64( CB4 ) );
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
m256_const1_64( CB5 ) );
VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
m256_const1_64( CB6 ) );
VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
m256_const1_64( CB7 ) );
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
ROUND_B_4WAY(6);
ROUND_B_4WAY(7);
ROUND_B_4WAY(8);
ROUND_B_4WAY(9);
ROUND_B_4WAY(0);
ROUND_B_4WAY(1);
ROUND_B_4WAY(2);
ROUND_B_4WAY(3);
ROUND_B_4WAY(4);
ROUND_B_4WAY(5);
sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
}
void blake512_4way_init( blake_4way_big_context *sc )
{
__m256i zero = m256_zero;
@@ -901,12 +681,10 @@ void blake512_4way_init( blake_4way_big_context *sc )
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->S, 0 ) = zero;
casti_m256i( sc->S, 1 ) = zero;
casti_m256i( sc->S, 2 ) = zero;
casti_m256i( sc->S, 3 ) = zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
@@ -925,31 +703,31 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE64(sc);
while ( len > 0 )
{
size_t clen;
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == buf_size )
{
if ( (T0 = T0 + 1024 ) < 1024 )
T1 = SPH_T64(T1 + 1);
COMPRESS64_4WAY;
ptr = 0;
}
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if (ptr == buf_size )
{
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
T1 = SPH_T64(T1 + 1);
COMPRESS64_4WAY;
ptr = 0;
}
}
WRITE_STATE64(sc);
sc->ptr = ptr;
@@ -961,7 +739,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
__m256i buf[16];
size_t ptr;
unsigned bit_len;
uint64_t th, tl;
sph_u64 th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
@@ -970,13 +748,13 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
{
@@ -1010,77 +788,13 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
// init, update & close
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
const void *data, size_t len )
/*
void
blake512_4way_init(void *cc)
{
// init
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
casti_m256i( sc->S, 0 ) = m256_zero;
casti_m256i( sc->S, 1 ) = m256_zero;
casti_m256i( sc->S, 2 ) = m256_zero;
casti_m256i( sc->S, 3 ) = m256_zero;
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
// update
memcpy_256( sc->buf, (__m256i*)data, len>>3 );
sc->ptr += len;
if ( len == 128 )
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
sc->ptr = 0;
}
// close
size_t ptr64 = sc->ptr >> 3;
unsigned bit_len;
uint64_t th, tl;
bit_len = sc->ptr << 3;
sc->buf[ptr64] = m256_const1_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( sc->ptr == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 1024 - bit_len;
memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
sc->buf[13] = m256_const1_64( 0x0100000000000000ULL );
sc->buf[14] = m256_const1_64( bswap_64( th ) );
sc->buf[15] = m256_const1_64( bswap_64( tl ) );
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_4way_compress( sc );
mm256_block_bswap_64( (__m256i*)dst, sc->H );
blake64_4way_init(cc, IV512, salt_zero_big);
}
*/
void
blake512_4way_update(void *cc, const void *data, size_t len)
@@ -1092,8 +806,17 @@ void
blake512_4way_close(void *cc, void *dst)
{
blake64_4way_close( cc, dst );
// blake512_4way_addbits_and_close(cc, dst);
}
/*
void
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
blake64_4way_close(cc, ub, n, dst, 8);
}
*/
#ifdef __cplusplus
}
#endif

View File

@@ -14,7 +14,7 @@ void blakecoin_4way_hash(void *state, const void *input)
blake256r8_4way_context ctx;
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
blake256r8_4way_update( &ctx, input + (64<<2), 16 );
blake256r8_4way( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
@@ -37,7 +37,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -71,7 +71,7 @@ void blakecoin_8way_hash( void *state, const void *input )
blake256r8_8way_context ctx;
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
blake256r8_8way_update( &ctx, input + (64<<3), 16 );
blake256r8_8way( &ctx, input + (64<<3), 16 );
blake256r8_8way_close( &ctx, vhash );
dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128,
@@ -95,7 +95,7 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256r8_8way_init( &blakecoin_8w_ctx );
blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );
blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,

View File

@@ -21,7 +21,7 @@ void decred_hash_4way( void *state, const void *input )
blake256_4way_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way_update( &ctx, tail, tail_len );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -46,7 +46,7 @@ int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {

View File

@@ -22,23 +22,23 @@ extern void pentablakehash_4way( void *output, const void *input )
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, input, 80 );
blake512_4way( &ctx, input, 80 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way_update( &ctx, vhash, 64 );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
memcpy( output, hash0, 32 );

View File

@@ -168,66 +168,6 @@ int cube_4way_close( cube_4way_context *sp, void *output )
return 0;
}
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
const void *data, size_t size )
{
__m512i *h = (__m512i*)sp->h;
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
: (__m128i*)IV256 );
sp->hashlen = hashbitlen/128;
sp->blocksize = 32/16;
sp->rounds = 16;
sp->pos = 0;
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
h[ 0] = m512_const1_128( iv[0] );
h[ 1] = m512_const1_128( iv[1] );
h[ 2] = m512_const1_128( iv[2] );
h[ 3] = m512_const1_128( iv[3] );
h[ 4] = m512_const1_128( iv[4] );
h[ 5] = m512_const1_128( iv[5] );
h[ 6] = m512_const1_128( iv[6] );
h[ 7] = m512_const1_128( iv[7] );
const int len = size >> 4;
const __m512i *in = (__m512i*)data;
__m512i *hash = (__m512i*)output;
int i;
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_4way( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
m512_const2_64( 0, 0x0000000000000080 ) );
transform_4way( sp );
sp->h[7] = _mm512_xor_si512( sp->h[7],
m512_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i )
transform_4way( sp );
memcpy( hash, sp->h, sp->hashlen<<6);
return 0;
}
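
cube_4way_full above is the whole CubeHash lifecycle in one call: xor each 32-byte block into the state and transform, then finalize with a 0x80 padding byte, one transform, a 1 xored into the last state word, and ten closing transforms. The same flow in scalar form, as a sketch with the 16-round transform left as a stub:

    #include <stdint.h>
    #include <string.h>

    static uint32_t x[32];   // 1024-bit CubeHash state (IV setup elided)

    static void transform( void ) { /* 16 rounds of add/rotate/swap/xor */ }

    static void cube_hash( uint8_t *out, size_t outlen,
                           const uint8_t *in, size_t len )
    {
       uint8_t *xb = (uint8_t*)x;
       for ( ; len >= 32; len -= 32, in += 32 )             // full blocks
       {
          for ( size_t i = 0; i < 32; i++ ) xb[i] ^= in[i];
          transform();
       }
       for ( size_t i = 0; i < len; i++ ) xb[i] ^= in[i];   // partial block
       xb[len] ^= 0x80;                                     // padding marker
       transform();
       x[31] ^= 1;                                          // finalization bit
       for ( int i = 0; i < 10; i++ ) transform();
       memcpy( out, x, outlen );
    }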
int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size )
{
@@ -436,62 +376,4 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
return 0;
}
int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
const void *data, size_t size )
{
__m256i *h = (__m256i*)sp->h;
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
: (__m128i*)IV256 );
sp->hashlen = hashbitlen/128;
sp->blocksize = 32/16;
sp->rounds = 16;
sp->pos = 0;
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
h[ 0] = m256_const1_128( iv[0] );
h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 3] = m256_const1_128( iv[3] );
h[ 4] = m256_const1_128( iv[4] );
h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m256_const1_128( iv[6] );
h[ 7] = m256_const1_128( iv[7] );
const int len = size >> 4;
const __m256i *in = (__m256i*)data;
__m256i *hash = (__m256i*)output;
int i;
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_2way( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
m256_const2_64( 0, 0x0000000000000080 ) );
transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7],
m256_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i ) transform_2way( sp );
memcpy( hash, sp->h, sp->hashlen<<5 );
return 0;
}
#endif

View File

@@ -21,12 +21,15 @@ typedef struct _cube_4way_context cube_4way_context;
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
int blockbytes );
// reinitialize context with same parameters, much faster.
int cube_4way_reinit( cube_4way_context *sp );
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
int cube_4way_close( cube_4way_context *sp, void *output );
int cube_4way_update_close( cube_4way_context *sp, void *output,
const void *data, size_t size );
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
const void *data, size_t size );
#endif
@@ -45,12 +48,15 @@ typedef struct _cube_2way_context cube_2way_context;
int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
int blockbytes );
// reinitialize context with same parameters, much faster.
int cube_2way_reinit( cube_2way_context *sp );
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
int cube_2way_close( cube_2way_context *sp, void *output );
int cube_2way_update_close( cube_2way_context *sp, void *output,
const void *data, size_t size );
int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
const void *data, size_t size );
#endif

View File

@@ -230,10 +230,11 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
_mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
transform( sp );
transform( sp );
transform( sp );
@@ -275,89 +276,11 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
_mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
transform( sp );
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->x[i];
return SUCCESS;
}
int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
const byte *data, size_t size )
{
__m128i *x = (__m128i*)sp->x;
sp->hashlen = hashbitlen/128;
sp->blocksize = 32/16;
sp->rounds = 16;
sp->pos = 0;
if ( hashbitlen == 512 )
{
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
const int len = size / 16;
const __m128i* in = (__m128i*)data;
__m128i* hash = (__m128i*)digest;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
// Current usage data is either 64 or 80 bytes.
for ( i = 0; i < len; i++ )
{
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
transform( sp );
transform( sp );

View File

@@ -19,7 +19,7 @@ struct _cubehashParam
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
__m128i _ALIGN(64) x[8]; // aligned for __m256i
__m128i _ALIGN(256) x[8]; // aligned for __m256i
};
typedef struct _cubehashParam cubehashParam;
@@ -39,9 +39,6 @@ int cubehashDigest(cubehashParam* sp, byte *digest);
int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data,
size_t size );
int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen,
const byte *data, size_t size );
#ifdef __cplusplus
}
#endif

View File

@@ -20,7 +20,6 @@
#include "hash_api.h"
//#include "vperm.h"
#include <immintrin.h>
#include "simd-utils.h"
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -518,165 +517,6 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
return SUCCESS;
}
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength datalen )
{
int i, j;
state->k = m128_zero;
state->processed_bits = 0;
state->uBufferBytes = 0;
switch( nHashSize )
{
case 256:
state->uHashSize = 256;
state->uBlockLength = 192;
state->uRounds = 8;
state->hashsize = m128_const_64( 0, 0x100 );
state->const1536 = m128_const_64( 0, 0x600 );
break;
case 512:
state->uHashSize = 512;
state->uBlockLength = 128;
state->uRounds = 10;
state->hashsize = m128_const_64( 0, 0x200 );
state->const1536 = m128_const_64( 0, 0x400 );
break;
default:
return BAD_HASHBITLEN;
}
for(i = 0; i < 4; i++)
for(j = 0; j < nHashSize / 256; j++)
state->state[i][j] = state->hashsize;
for(i = 0; i < 4; i++)
for(j = nHashSize / 256; j < 4; j++)
state->state[i][j] = m128_zero;
unsigned int uBlockCount, uRemainingBytes;
if( (state->uBufferBytes + datalen) >= state->uBlockLength )
{
if( state->uBufferBytes != 0 )
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
// Process buffer
Compress( state, state->buffer, 1 );
state->processed_bits += state->uBlockLength * 8;
data += state->uBlockLength - state->uBufferBytes;
datalen -= state->uBlockLength - state->uBufferBytes;
}
// buffer now does not contain any unprocessed bytes
uBlockCount = datalen / state->uBlockLength;
uRemainingBytes = datalen % state->uBlockLength;
if( uBlockCount > 0 )
{
Compress( state, data, uBlockCount );
state->processed_bits += uBlockCount * state->uBlockLength * 8;
data += uBlockCount * state->uBlockLength;
}
if( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
else
{
memcpy( state->buffer + state->uBufferBytes, (void*)data, datalen );
state->uBufferBytes += datalen;
}
__m128i remainingbits;
// Add remaining bytes in the buffer
state->processed_bits += state->uBufferBytes * 8;
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
// Pad with 0x80
state->buffer[state->uBufferBytes++] = 0x80;
// Enough buffer space for padding in this block?
if( (state->uBlockLength - state->uBufferBytes) >= 18 )
{
// Pad with zeros
memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Last block contains message bits?
if( state->uBufferBytes == 1 )
{
state->k = _mm_xor_si128( state->k, state->k );
state->k = _mm_sub_epi64( state->k, state->const1536 );
}
else
{
state->k = _mm_add_epi64( state->k, remainingbits );
state->k = _mm_sub_epi64( state->k, state->const1536 );
}
// Compress
Compress( state, state->buffer, 1 );
}
else
{
// Fill with zero and compress
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - state->uBufferBytes );
state->k = _mm_add_epi64( state->k, remainingbits );
state->k = _mm_sub_epi64( state->k, state->const1536 );
Compress( state, state->buffer, 1 );
// Last block
memset( state->buffer, 0, state->uBlockLength - 18 );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Compress the last block
state->k = _mm_xor_si128( state->k, state->k );
state->k = _mm_sub_epi64( state->k, state->const1536 );
Compress( state, state->buffer, 1) ;
}
// Store the hash value
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
if( state->uHashSize == 512 )
{
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
}
return SUCCESS;
}
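
The finalization above always writes the same trailer into the last block: a 0x80 byte after the message, zero fill, a 2-byte digest size at blocklen-18, the 8-byte processed-bit count at blocklen-16, and a zero final 8 bytes. For the 512-bit variant (128-byte block), a sketch of that layout; it assumes little-endian stores, as the vector code does:

    #include <stdint.h>
    #include <string.h>

    // ECHO-512 final-block trailer, mirroring the padding code above.
    static void echo512_pad( uint8_t block[128], size_t msg_len, uint64_t bits )
    {
       block[msg_len] = 0x80;                        // padding marker
       memset( block + msg_len + 1, 0, 128 - ( msg_len + 1 ) );
       uint16_t hs = 512;
       memcpy( block + 128 - 18, &hs, 2 );           // hash size
       memcpy( block + 128 - 16, &bits, 8 );         // processed bits
       // bytes 120..127 remain zero
    }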
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{

View File

@@ -55,8 +55,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
int nHashSize, const BitSequence *data, DataLength databitlen );
#endif // HASH_API_H

View File

@@ -313,92 +313,4 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
return 0;
}
int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
const void *data, int datalen )
{
int i, j;
int databitlen = datalen * 8;
ctx->k = m512_zero;
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
switch( nHashSize )
{
case 256:
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
break;
case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
break;
default:
return 1;
}
for( i = 0; i < 4; i++ )
for( j = 0; j < nHashSize / 256; j++ )
ctx->state[ i ][ j ] = ctx->hashsize;
for( i = 0; i < 4; i++ )
for( j = nHashSize / 256; j < 4; j++ )
ctx->state[ i ][ j ] = m512_zero;
// bytelen is either 32 (maybe), 64 or 80 or 128!
// all are less than full block.
int vlen = datalen / 32;
const int vblen = ctx->uBlockLength / 16; // 16 bytes per lane
__m512i remainingbits;
if ( databitlen == 1024 )
{
echo_4way_compress( ctx, data, 1 );
ctx->processed_bits = 1024;
remainingbits = m512_const2_64( 0, -1024 );
vlen = 0;
}
else
{
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
memcpy_512( ctx->buffer, data, vlen );
ctx->processed_bits += (unsigned int)( databitlen );
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
}
ctx->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
ctx->buffer[ vblen-2 ] =
_mm512_set4_epi32( (uint32_t)ctx->uHashSize << 16, 0, 0, 0 );
ctx->buffer[ vblen-1 ] =
_mm512_set4_epi64( 0, ctx->processed_bits,
0, ctx->processed_bits );
ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
echo_4way_compress( ctx, ctx->buffer, 1 );
_mm512_store_si512( (__m512i*)hashval + 0, ctx->state[ 0 ][ 0] );
_mm512_store_si512( (__m512i*)hashval + 1, ctx->state[ 1 ][ 0] );
if ( ctx->uHashSize == 512 )
{
_mm512_store_si512( (__m512i*)hashval + 2, ctx->state[ 2 ][ 0 ] );
_mm512_store_si512( (__m512i*)hashval + 3, ctx->state[ 3 ][ 0 ] );
}
return 0;
}
#endif

View File

@@ -32,8 +32,5 @@ int echo_close( echo_4way_context *state, void *hashval );
int echo_4way_update_close( echo_4way_context *state, void *hashval,
const void *data, int databitlen );
int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
const void *data, int datalen );
#endif
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -11,6 +11,17 @@
#include <wmmintrin.h>
#include "hash-groestl.h"
/* global constants */
__m128i ROUND_CONST_Lx;
//__m128i ROUND_CONST_L0[ROUNDS512];
//__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
@@ -100,7 +111,7 @@
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = ALL_1B;\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
@@ -141,41 +152,24 @@
}/*MixBytes*/
static const uint64_t round_const_p[] __attribute__ ((aligned (64))) =
{
0x7060504030201000, 0xf0e0d0c0b0a09080,
0x7161514131211101, 0xf1e1d1c1b1a19181,
0x7262524232221202, 0xf2e2d2c2b2a29282,
0x7363534333231303, 0xf3e3d3c3b3a39383,
0x7464544434241404, 0xf4e4d4c4b4a49484,
0x7565554535251505, 0xf5e5d5c5b5a59585,
0x7666564636261606, 0xf6e6d6c6b6a69686,
0x7767574737271707, 0xf7e7d7c7b7a79787,
0x7868584838281808, 0xf8e8d8c8b8a89888,
0x7969594939291909, 0xf9e9d9c9b9a99989,
0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a,
0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b,
0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c,
0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d
};
static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
{
0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f,
0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e,
0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d,
0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c,
0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b,
0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a,
0x8999a9b9c9d9e9f9, 0x0919293949596979,
0x8898a8b8c8d8e8f8, 0x0818283848586878,
0x8797a7b7c7d7e7f7, 0x0717273747576777,
0x8696a6b6c6d6e6f6, 0x0616263646566676,
0x8595a5b5c5d5e5f5, 0x0515253545556575,
0x8494a4b4c4d4e4f4, 0x0414243444546474,
0x8393a3b3c3d3e3f3, 0x0313233343536373,
0x8292a2b2c2d2e2f2, 0x0212223242526272
};
#define SET_CONSTANTS(){\
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\
SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\
SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\
SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\
SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\
SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\
SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\
for(i = 0; i < ROUNDS1024; i++)\
{\
ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
}\
}while(0);\
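
Both forms of these constants encode the same values: each 32-bit lane of the P constants is a fixed base pattern xored with the round number replicated into every byte, and Q is the bitwise complement of P, which is exactly what the i * 0x01010101 term in SET_CONSTANTS generates. A small sketch that reproduces the static tables from the formula:

    #include <stdint.h>
    #include <stdio.h>

    #define ROUNDS1024 14

    int main( void )
    {
       for ( int i = 0; i < ROUNDS1024; i++ )
       {
          uint32_t r = (uint32_t)i * 0x01010101u;
          // Lanes low to high, matching round_const_p / round_const_q above.
          uint32_t p[4] = { 0x30201000u ^ r, 0x70605040u ^ r,
                            0xb0a09080u ^ r, 0xf0e0d0c0u ^ r };
          uint32_t q[4] = { 0xcfdfefffu ^ r, 0x8f9fafbfu ^ r,
                            0x4f5f6f7fu ^ r, 0x0f1f2f3fu ^ r };
          printf( "P[%2d] = %08x%08x %08x%08x\n", i, p[1], p[0], p[3], p[2] );
          printf( "Q[%2d] = %08x%08x %08x%08x\n", i, q[1], q[0], q[3], q[2] );
       }
       return 0;
    }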
/* one round
* a0-a7 = input rows
@@ -200,50 +194,30 @@ static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
u8 round_counter = 0;\
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant P1024 */\
xmm8 = _mm_xor_si128( xmm8, \
casti_m128i( round_const_p, round_counter ) ); \
xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8( xmm8, m128_const_64( 0x0306090c0f020508, \
0x0b0e0104070a0d00 ) ); \
xmm9 = _mm_shuffle_epi8( xmm9, m128_const_64( 0x04070a0d00030609, \
0x0c0f0205080b0e01 ) ); \
xmm10 = _mm_shuffle_epi8( xmm10, m128_const_64( 0x05080b0e0104070a, \
0x0d000306090c0f02 ) ); \
xmm11 = _mm_shuffle_epi8( xmm11, m128_const_64( 0x06090c0f0205080b, \
0x0e0104070a0d0003 ) ); \
xmm12 = _mm_shuffle_epi8( xmm12, m128_const_64( 0x070a0d000306090c, \
0x0f0205080b0e0104 ) ); \
xmm13 = _mm_shuffle_epi8( xmm13, m128_const_64( 0x080b0e0104070a0d, \
0x000306090c0f0205 ) ); \
xmm14 = _mm_shuffle_epi8( xmm14, m128_const_64( 0x090c0f0205080b0e, \
0x0104070a0d000306 ) ); \
xmm15 = _mm_shuffle_epi8( xmm15, m128_const_64( 0x0e0104070a0d0003, \
0x06090c0f0205080b ) ); \
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant P1024 */\
xmm0 = _mm_xor_si128( xmm0, \
casti_m128i( round_const_p, round_counter+1 ) ); \
xmm0 = _mm_shuffle_epi8( xmm0, m128_const_64( 0x0306090c0f020508, \
0x0b0e0104070a0d00 ) ); \
xmm1 = _mm_shuffle_epi8( xmm1, m128_const_64( 0x04070a0d00030609, \
0x0c0f0205080b0e01 ) ); \
xmm2 = _mm_shuffle_epi8( xmm2, m128_const_64( 0x05080b0e0104070a, \
0x0d000306090c0f02 ) ); \
xmm3 = _mm_shuffle_epi8( xmm3, m128_const_64( 0x06090c0f0205080b, \
0x0e0104070a0d0003 ) ); \
xmm4 = _mm_shuffle_epi8( xmm4, m128_const_64( 0x070a0d000306090c, \
0x0f0205080b0e0104 ) ); \
xmm5 = _mm_shuffle_epi8( xmm5, m128_const_64( 0x080b0e0104070a0d, \
0x000306090c0f0205 ) ); \
xmm6 = _mm_shuffle_epi8( xmm6, m128_const_64( 0x090c0f0205080b0e, \
0x0104070a0d000306 ) ); \
xmm7 = _mm_shuffle_epi8( xmm7, m128_const_64( 0x0e0104070a0d0003, \
0x06090c0f0205080b ) ); \
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
@@ -251,68 +225,48 @@ static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
u8 round_counter = 0;\
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant Q1024 */\
xmm1 = m128_neg1;\
xmm8 = _mm_xor_si128( xmm8, xmm1 ); \
xmm9 = _mm_xor_si128( xmm9, xmm1 ); \
xmm10 = _mm_xor_si128( xmm10, xmm1 ); \
xmm11 = _mm_xor_si128( xmm11, xmm1 ); \
xmm12 = _mm_xor_si128( xmm12, xmm1 ); \
xmm13 = _mm_xor_si128( xmm13, xmm1 ); \
xmm14 = _mm_xor_si128( xmm14, xmm1 ); \
xmm15 = _mm_xor_si128( xmm15, \
casti_m128i( round_const_q, round_counter ) ); \
xmm1 = ALL_FF;\
xmm8 = _mm_xor_si128(xmm8, xmm1);\
xmm9 = _mm_xor_si128(xmm9, xmm1);\
xmm10 = _mm_xor_si128(xmm10, xmm1);\
xmm11 = _mm_xor_si128(xmm11, xmm1);\
xmm12 = _mm_xor_si128(xmm12, xmm1);\
xmm13 = _mm_xor_si128(xmm13, xmm1);\
xmm14 = _mm_xor_si128(xmm14, xmm1);\
xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8( xmm8, m128_const_64( 0x04070a0d00030609, \
0x0c0f0205080b0e01 ) ); \
xmm9 = _mm_shuffle_epi8( xmm9, m128_const_64( 0x06090c0f0205080b, \
0x0e0104070a0d0003 ) ); \
xmm10 = _mm_shuffle_epi8( xmm10, m128_const_64( 0x080b0e0104070a0d, \
0x000306090c0f0205 ) ); \
xmm11 = _mm_shuffle_epi8( xmm11, m128_const_64( 0x0e0104070a0d0003, \
0x06090c0f0205080b ) ); \
xmm12 = _mm_shuffle_epi8( xmm12, m128_const_64( 0x0306090c0f020508, \
0x0b0e0104070a0d00 ) ); \
xmm13 = _mm_shuffle_epi8( xmm13, m128_const_64( 0x05080b0e0104070a, \
0x0d000306090c0f02 ) ); \
xmm14 = _mm_shuffle_epi8( xmm14, m128_const_64( 0x070a0d000306090c, \
0x0f0205080b0e0104 ) ); \
xmm15 = _mm_shuffle_epi8( xmm15, m128_const_64( 0x090c0f0205080b0e, \
0x0104070a0d000306 ) ); \
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\
/* SubBytes + MixBytes */\
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant Q1024 */\
xmm9 = m128_neg1;\
xmm0 = _mm_xor_si128( xmm0, xmm9 ); \
xmm1 = _mm_xor_si128( xmm1, xmm9 ); \
xmm2 = _mm_xor_si128( xmm2, xmm9 ); \
xmm3 = _mm_xor_si128( xmm3, xmm9 ); \
xmm4 = _mm_xor_si128( xmm4, xmm9 ); \
xmm5 = _mm_xor_si128( xmm5, xmm9 ); \
xmm6 = _mm_xor_si128( xmm6, xmm9 ); \
xmm7 = _mm_xor_si128( xmm7, \
casti_m128i( round_const_q, round_counter+1 ) ); \
xmm9 = ALL_FF;\
xmm0 = _mm_xor_si128(xmm0, xmm9);\
xmm1 = _mm_xor_si128(xmm1, xmm9);\
xmm2 = _mm_xor_si128(xmm2, xmm9);\
xmm3 = _mm_xor_si128(xmm3, xmm9);\
xmm4 = _mm_xor_si128(xmm4, xmm9);\
xmm5 = _mm_xor_si128(xmm5, xmm9);\
xmm6 = _mm_xor_si128(xmm6, xmm9);\
xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm_shuffle_epi8( xmm0, m128_const_64( 0x04070a0d00030609, \
0x0c0f0205080b0e01 ) ); \
xmm1 = _mm_shuffle_epi8( xmm1, m128_const_64( 0x06090c0f0205080b, \
0x0e0104070a0d0003 ) ); \
xmm2 = _mm_shuffle_epi8( xmm2, m128_const_64( 0x080b0e0104070a0d, \
0x000306090c0f0205 ) ); \
xmm3 = _mm_shuffle_epi8( xmm3, m128_const_64( 0x0e0104070a0d0003, \
0x06090c0f0205080b ) ); \
xmm4 = _mm_shuffle_epi8( xmm4, m128_const_64( 0x0306090c0f020508, \
0x0b0e0104070a0d00 ) ); \
xmm5 = _mm_shuffle_epi8( xmm5, m128_const_64( 0x05080b0e0104070a, \
0x0d000306090c0f02 ) ); \
xmm6 = _mm_shuffle_epi8( xmm6, m128_const_64( 0x070a0d000306090c, \
0x0f0205080b0e0104 ) ); \
xmm7 = _mm_shuffle_epi8( xmm7, m128_const_64( 0x090c0f0205080b0e, \
0x0104070a0d000306 ) ); \
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\
/* SubBytes + MixBytes */\
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
@@ -324,7 +278,7 @@ static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
* clobbers: t0-t7
*/
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
t0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 );\
t0 = TRANSP_MASK;\
\
i6 = _mm_shuffle_epi8(i6, t0);\
i0 = _mm_shuffle_epi8(i0, t0);\
@@ -412,7 +366,7 @@ static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
i4 = _mm_unpacklo_epi64(i4, i5);\
t1 = _mm_unpackhi_epi64(t1, i5);\
t2 = i6;\
o0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 ); \
o0 = TRANSP_MASK;\
i6 = _mm_unpacklo_epi64(i6, i7);\
t2 = _mm_unpackhi_epi64(t2, i7);\
/* load transpose mask into a register, because it will be used 8 times */\

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,10 @@
// specify assembly or intrinsics implementation
//#define TASM
#define TINTR
// Not to be confused with AVX512VAES
#define VAES
// #define VAVX
// #define VVPERM
//#endif
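/* These switches are consumed by the groestl build to pick one
 * implementation at compile time. The selection pattern is roughly the
 * following (a sketch; the include names are assumptions for
 * illustration, not necessarily the real file layout): */
#ifdef TASM
  #include "groestl-asm-aes.h"     /* inline-assembly version */
#elif defined(TINTR)
  #include "groestl-intr-aes.h"    /* intrinsics version */
#endif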

View File

@@ -0,0 +1,529 @@
/* groestl-asm-aes.h Aug 2011
*
* Groestl implementation with inline assembly using ssse3, sse4.1, and aes
* instructions.
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include "hash-groestl256.h"
/* global constants */
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
/* temporary variables */
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP[3*16];
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\
asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\
asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\
asm("pand xmm"tostr(j)", xmm"tostr(k)"");\
asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\
}/**/
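/* For reference, a minimal scalar sketch of what MUL2 computes on each
 * of its 16 byte lanes (gf256_mul2 is a hypothetical helper, not part
 * of this file): doubling in GF(2^8) with the AES reduction polynomial
 * 0x11b. pcmpgtb against zero turns each sign bit into a 0x00/0xFF
 * mask, paddb doubles, and pand/pxor apply the conditional reduction
 * by 0x1b. */
#include <stdint.h>

/* Double one byte in GF(2^8) mod x^8+x^4+x^3+x+1; MUL2 performs
 * exactly this on 16 lanes in parallel. */
static inline uint8_t gf256_mul2(uint8_t x)
{
    uint8_t mask = (uint8_t)-(x >> 7);        /* 0xFF if the top bit is set */
    return (uint8_t)((x << 1) ^ (mask & 0x1b));
}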
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\
asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\
asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\
asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\
asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\
asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\
asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\
asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\
/* spill values y_4, y_5 to memory */\
asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\
asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\
asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\
asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\
asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\
asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\
asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
\
/* compute x_i = t_i + t_{i+3} */\
asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\
asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\
asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\
asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\
asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\
asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
asm("movaps xmm"tostr(b1)", [ALL_1B]");\
MUL2(a0, b0, b1);\
asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\
MUL2(a1, b0, b1);\
asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\
MUL2(a2, b0, b1);\
asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\
MUL2(a3, b0, b1);\
asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\
MUL2(a4, b0, b1);\
asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\
MUL2(a5, b0, b1);\
asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\
MUL2(a6, b0, b1);\
asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\
MUL2(a7, b0, b1);\
asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
MUL2(a1, b0, b1);\
asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\
MUL2(a2, b0, b1);\
asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
MUL2(a5, b0, b1);\
asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\
MUL2(a6, b0, b1);\
asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
MUL2(a7, b0, b1);\
asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\
asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\
asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
}/*MixBytes*/
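/* Reading '+' as XOR and '2*' as lane-wise doubling, the relations in
 * the comment above translate into this scalar reference (a sketch
 * only: indices are mod 8 and uint8_t stands in for a whole 16-byte
 * row; the SIMD schedule above interleaves these steps). */
#include <stdint.h>

typedef uint8_t row_t;                       /* stands in for one 16-byte row */

static row_t mul2(row_t x)                   /* lane-wise GF(2^8) doubling */
{ return (row_t)((x << 1) ^ ((-(x >> 7)) & 0x1b)); }

static void mixbytes_ref(const row_t a[8], row_t b[8])
{
    row_t t[8], x[8], y[8], w[8];
    for (int i = 0; i < 8; i++) t[i] = a[i] ^ a[(i + 1) % 8];
    for (int i = 0; i < 8; i++) x[i] = t[i] ^ t[(i + 3) % 8];
    for (int i = 0; i < 8; i++) y[i] = t[i] ^ t[(i + 2) % 8] ^ a[(i + 6) % 8];
    for (int i = 0; i < 8; i++) w[i] = mul2(x[i]) ^ y[(i + 4) % 8];           /* z_i, then w_i */
    for (int i = 0; i < 8; i++) b[i] = mul2(w[(i + 3) % 8]) ^ y[(i + 4) % 8]; /* v_i, then b_i */
}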
#define SET_CONSTANTS(){\
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
for(i = 0; i < ROUNDS512; i++)\
{\
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
}\
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
}while(0);
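/* As a sanity check on the values generated above: the low quadword of
 * each round-i constant for P is the byte sequence 0x00^i .. 0x70^i,
 * and the row-7 constant for Q is its bitwise-complement pattern. A
 * stand-alone print-out (a sketch) for the ten 512-bit rounds: */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    for (uint64_t i = 0; i < 10; i++)                  /* ROUNDS512 == 10 */
        printf("L0[%llu] = %016llx   L7[%llu] = %016llx\n",
               (unsigned long long)i,
               (unsigned long long)((i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL),
               (unsigned long long)i,
               (unsigned long long)((i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL));
    return 0;
}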
#define Push_All_Regs() do{\
/* not using any...
asm("push rax");\
asm("push rbx");\
asm("push rcx");*/\
}while(0);
#define Pop_All_Regs() do{\
/* not using any...
asm("pop rcx");\
asm("pop rbx");\
asm("pop rax");*/\
}while(0);
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
/* ShiftBytes + SubBytes (interleaved) */\
asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
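/* The ShiftBytes/SubBytes interleave above rests on a standard trick:
 * aesenclast with an all-zero round key performs only AES ShiftRows
 * and SubBytes, and since SubBytes acts bytewise it commutes with any
 * byte permutation. Each SUBSH_MASK is therefore Groestl's ShiftBytes
 * pre-composed with the inverse of AES ShiftRows, so the
 * pshufb/aesenclast pair leaves exactly SubBytes(ShiftBytes(row)). A
 * minimal intrinsics sketch of the per-row pattern (compile with
 * -mssse3 -maes; the mask is whichever SUBSH_MASK row applies): */
#include <immintrin.h>

static inline __m128i subbytes_shiftbytes(__m128i row, __m128i subsh_mask)
{
    row = _mm_shuffle_epi8(row, subsh_mask);               /* pshufb */
    return _mm_aesenclast_si128(row, _mm_setzero_si128()); /* zero round key */
}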
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
\
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
\
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
\
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
\
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
}/**/
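/* Conceptually the 512-bit state is an 8x8 byte matrix, and the word
 * interleaves above move it from two-columns-per-register to
 * two-rows-per-register. The net effect, as a plain scalar sketch: */
#include <stdint.h>

/* Column ordering to row ordering for the 8x8 Groestl state. */
static void transpose_8x8(const uint8_t in[8][8], uint8_t out[8][8])
{
    for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
            out[r][c] = in[c][r];
}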
/* Matrix Transpose Step 2
* inputs are two 512-bit states with two rows in one xmm
* outputs are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Inverse Step 2
* inputs are two 512-bit states with one row of each state in one xmm
* outputs are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
void INIT256(u64* h)
{
/* System V AMD64 calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
asm volatile ("emms");
/* load IV into registers xmm12 - xmm15 */
asm ("movaps xmm12, [rdi+0*16]");
asm ("movaps xmm13, [rdi+1*16]");
asm ("movaps xmm14, [rdi+2*16]");
asm ("movaps xmm15, [rdi+3*16]");
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* store transposed IV */
asm ("movaps [rdi+0*16], xmm12");
asm ("movaps [rdi+1*16], xmm2");
asm ("movaps [rdi+2*16], xmm6");
asm ("movaps [rdi+3*16], xmm7");
asm volatile ("emms");
asm (".att_syntax noprefix");
}
void TF512(u64* h, u64* m)
{
/* System V AMD64 calling convention: */
/* chaining value CV in rdi */
/* message M in rsi */
#ifdef IACA_TRACE
IACA_START;
#endif
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load message into registers xmm12 - xmm15 (Q = message) */
asm ("movaps xmm12, [rsi+0*16]");
asm ("movaps xmm13, [rsi+1*16]");
asm ("movaps xmm14, [rsi+2*16]");
asm ("movaps xmm15, [rsi+3*16]");
/* transform message M from column ordering into row ordering */
/* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm0, [rdi+1*16]");
asm ("movaps xmm4, [rdi+2*16]");
asm ("movaps xmm5, [rdi+3*16]");
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
asm ("pxor xmm8, xmm12");
asm ("pxor xmm0, xmm2");
asm ("pxor xmm4, xmm6");
asm ("pxor xmm5, xmm7");
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
asm ("pxor xmm0, xmm8");
asm ("pxor xmm1, xmm10");
asm ("pxor xmm2, xmm12");
asm ("pxor xmm3, xmm14");
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
asm ("pxor xmm0, [rdi+0*16]");
asm ("pxor xmm1, [rdi+1*16]");
asm ("pxor xmm2, [rdi+2*16]");
asm ("pxor xmm3, [rdi+3*16]");
/* store CV */
asm ("movaps [rdi+0*16], xmm0");
asm ("movaps [rdi+1*16], xmm1");
asm ("movaps [rdi+2*16], xmm2");
asm ("movaps [rdi+3*16], xmm3");
Pop_All_Regs();
asm (".att_syntax noprefix");
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
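/* Algebraically, TF512 computes the standard Groestl compression
 * f(h, m) = P(h ^ m) ^ Q(m) ^ h. A hypothetical scalar outline, with
 * P512/Q512 standing in for the two permutations that ROUNDS_P_Q
 * evaluates in parallel: */
#include <stdint.h>

void P512(uint64_t out[8], const uint64_t in[8]);   /* assumed helpers */
void Q512(uint64_t out[8], const uint64_t in[8]);

static void compress512(uint64_t h[8], const uint64_t m[8])
{
    uint64_t pin[8], pout[8], qout[8];
    for (int i = 0; i < 8; i++) pin[i] = h[i] ^ m[i];
    P512(pout, pin);
    Q512(qout, m);
    for (int i = 0; i < 8; i++) h[i] ^= pout[i] ^ qout[i];  /* feed-forward */
}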
void OF512(u64* h)
{
/* System V AMD64 calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm10, [rdi+1*16]");
asm ("movaps xmm12, [rdi+2*16]");
asm ("movaps xmm14, [rdi+3*16]");
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
asm ("pxor xmm8, [rdi+0*16]");
asm ("pxor xmm10, [rdi+1*16]");
asm ("pxor xmm12, [rdi+2*16]");
asm ("pxor xmm14, [rdi+3*16]");
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
/* we only need to return the truncated half of the state */
asm ("movaps [rdi+2*16], xmm9");
asm ("movaps [rdi+3*16], xmm11");
Pop_All_Regs();
asm (".att_syntax noprefix");
return;
}
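/* The output transform is omega(h) = trunc_256(P(h) ^ h), which
 * matches the truncated store of xmm9/xmm11 above. As a scalar sketch
 * (P512 is the same assumed helper as before): */
static void output_transform(const uint64_t h[8], uint64_t digest[4])
{
    uint64_t pout[8];
    P512(pout, h);
    for (int i = 0; i < 4; i++)
        digest[i] = pout[i + 4] ^ h[i + 4];   /* keep the last 256 bits */
}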

View File

@@ -0,0 +1,519 @@
/* groestl-asm-avx.h Aug 2011
*
* Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx
* instructions.
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include "hash-groestl256.h"
/* global variables */
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16];
__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16];
__attribute__ ((aligned (32))) unsigned char ALL_1B[32];
__attribute__ ((aligned (32))) unsigned char ALL_FF[32];
/* temporary variables */
__attribute__ ((aligned (32))) unsigned char TEMP[6*32];
#define tos(a) #a
#define tostr(a) tos(a)
#define SET_CONSTANTS(){\
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
for(i = 0; i < ROUNDS512; i++)\
{\
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
}\
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
}while(0);
#define Push_All_Regs() do{\
/* not using any...
asm("push rax");\
asm("push rbx");\
asm("push rcx");*/\
}while(0);
#define Pop_All_Regs() do{\
/* not using any...
asm("pop rcx");\
asm("pop rbx");\
asm("pop rax");*/\
}while(0);
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b
* xmm[z] has to be zero */
#define VMUL2(i, j, k, z){\
asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
}/**/
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b
* xmm[z] has to be zero */
#define VMUL2v2(i, j, k, z){\
asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
}/**/
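/* Both variants compute 16 GF(2^8) doublings in parallel; VMUL2
 * derives the reduction mask with a signed compare, while VMUL2v2 uses
 * vpblendvb to select the 0x1b bytes directly. The compare variant in
 * intrinsics form (a sketch): */
#include <immintrin.h>

static inline __m128i mul2_x16(__m128i x)
{
    const __m128i all_1b = _mm_set1_epi8(0x1b);
    __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), x);  /* 0xFF where the top bit is set */
    x = _mm_add_epi8(x, x);                              /* byte-wise left shift */
    return _mm_xor_si128(x, _mm_and_si128(m, all_1b));   /* conditional reduction */
}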
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
\
/* t_i = a_i + a_{i+1} */\
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\
\
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\
\
/* spill values y_4, y_5 to memory */\
asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\
asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\
asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\
\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\
\
/* compute x_i = t_i + t_{i+3} */\
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\
\
/*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\
VMUL2(a7, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a0, b0, b1, b2);\
\
/* compute w_i : add y_{i+4} */\
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\
\
/*compute v_i: double w_i */\
VMUL2(a0, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a7, b0, b1, b2);\
\
/* add to y_4 y_5 .. v3, v4, ... */\
asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\
asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\
asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\
}/*MixBytes*/
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
/* ShiftBytes + SubBytes (interleaved) */\
asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\
asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\
\
asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\
\
asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
\
asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
\
asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\
asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Step 2
* inputs are two 512-bit states with two rows in one xmm
* outputs are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\
asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\
asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\
asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Inverse Step 2
* inputs are two 512-bit states with one row of each state in one xmm
* outputs are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
void INIT256(u64* h)
{
/* System V AMD64 calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
asm volatile ("emms");
/* load IV into registers xmm12 - xmm15 */
asm ("vmovaps xmm12, [rdi+0*16]");
asm ("vmovaps xmm13, [rdi+1*16]");
asm ("vmovaps xmm14, [rdi+2*16]");
asm ("vmovaps xmm15, [rdi+3*16]");
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* store transposed IV */
asm ("vmovaps [rdi+0*16], xmm12");
asm ("vmovaps [rdi+1*16], xmm2");
asm ("vmovaps [rdi+2*16], xmm6");
asm ("vmovaps [rdi+3*16], xmm7");
asm volatile ("emms");
asm (".att_syntax noprefix");
}
void TF512(u64* h, u64* m)
{
/* System V AMD64 calling convention: */
/* chaining value CV in rdi */
/* message M in rsi */
#ifdef IACA_TRACE
IACA_START;
#endif
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load message into registers xmm12 - xmm15 (Q = message) */
asm ("vmovaps xmm12, [rsi+0*16]");
asm ("vmovaps xmm13, [rsi+1*16]");
asm ("vmovaps xmm14, [rsi+2*16]");
asm ("vmovaps xmm15, [rsi+3*16]");
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* load previous chaining value and xor message to CV to get input of P */
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
asm ("vpxor xmm8, xmm12, [rdi+0*16]");
asm ("vpxor xmm0, xmm2, [rdi+1*16]");
asm ("vpxor xmm4, xmm6, [rdi+2*16]");
asm ("vpxor xmm5, xmm7, [rdi+3*16]");
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
asm ("vpxor xmm0, xmm0, xmm8");
asm ("vpxor xmm1, xmm1, xmm10");
asm ("vpxor xmm2, xmm2, xmm12");
asm ("vpxor xmm3, xmm3, xmm14");
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
asm ("vpxor xmm0, xmm0, [rdi+0*16]");
asm ("vpxor xmm1, xmm1, [rdi+1*16]");
asm ("vpxor xmm2, xmm2, [rdi+2*16]");
asm ("vpxor xmm3, xmm3, [rdi+3*16]");
/* store CV */
asm ("vmovaps [rdi+0*16], xmm0");
asm ("vmovaps [rdi+1*16], xmm1");
asm ("vmovaps [rdi+2*16], xmm2");
asm ("vmovaps [rdi+3*16], xmm3");
Pop_All_Regs();
asm (".att_syntax noprefix");
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
/* System V AMD64 calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
asm ("vmovaps xmm8, [rdi+0*16]");
asm ("vmovaps xmm10, [rdi+1*16]");
asm ("vmovaps xmm12, [rdi+2*16]");
asm ("vmovaps xmm14, [rdi+3*16]");
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
asm ("vpxor xmm8, xmm8, [rdi+0*16]");
asm ("vpxor xmm10, xmm10, [rdi+1*16]");
asm ("vpxor xmm12, xmm12, [rdi+2*16]");
asm ("vpxor xmm14, xmm14, [rdi+3*16]");
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
/* we only need to return the truncated half of the state */
asm ("vmovaps [rdi+2*16], xmm9");
asm ("vmovaps [rdi+3*16], xmm11");
Pop_All_Regs();
asm (".att_syntax noprefix");
return;
}

View File

@@ -0,0 +1,856 @@
/* groestl-asm-vperm.h Aug 2011
*
* Groestl implementation with inline assembly using ssse3 instructions.
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* Based on the vperm and aes_ni implementations of the hash function Groestl
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
* Institute of Applied Mathematics, Middle East Technical University, Turkey
*
* This code is placed in the public domain
*/
#include "hash-groestl256.h"
/* global constants */
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
__attribute__ ((aligned (16))) unsigned char ALL_0F[16];
__attribute__ ((aligned (16))) unsigned char ALL_15[16];
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
__attribute__ ((aligned (16))) unsigned char ALL_63[16];
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16];
__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16];
/* temporary variables */
__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16];
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
__attribute__ ((aligned (16))) unsigned char TEMP[8*16];
#define tos(a) #a
#define tostr(a) tos(a)
#define SET_SHARED_CONSTANTS(){\
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\
((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\
((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\
((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\
((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\
((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\
((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\
((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\
((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\
((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\
((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\
((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\
((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\
((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\
((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\
((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\
((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\
((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\
((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\
((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\
((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\
((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\
((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\
((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\
((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\
((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\
((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\
((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\
/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\
((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\
((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\
((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\
((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\
((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\
}/**/
/* VPERM
* Transform without setting the constants c*
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of the hash function Groestl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\
asm ("psrld xmm"tostr(t0)", 4");\
asm ("psrld xmm"tostr(t1)", 4");\
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\
asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\
asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\
asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\
asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\
asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\
asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\
}/**/
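/* This is the standard vperm nibble technique from Hamburg's
 * constant-time AES work: split each byte into low and high nibbles,
 * look each nibble up in a 16-entry pshufb table, and XOR the two
 * halves. An equivalent intrinsics sketch for a single row (tbl_lo
 * plays the role of c2 above, tbl_hi of c1): */
#include <immintrin.h>

static inline __m128i vperm_row(__m128i x, __m128i tbl_lo, __m128i tbl_hi)
{
    const __m128i mask_0f = _mm_set1_epi8(0x0f);
    __m128i lo = _mm_and_si128(x, mask_0f);                    /* low nibbles  */
    __m128i hi = _mm_and_si128(_mm_srli_epi16(x, 4), mask_0f); /* high nibbles */
    return _mm_xor_si128(_mm_shuffle_epi8(tbl_lo, lo),
                         _mm_shuffle_epi8(tbl_hi, hi));
}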
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
asm ("movaps xmm"tostr(c0)", [ALL_0F]");\
asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\
asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\
}/**/
/* VPERM
* Transform
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of the hash function Groestl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Transform State
* inputs:
* a0-a3 = state
* table = transformation table to use
* t* = clobbers
* outputs:
* a0-a3 = transformed state
* */
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Add Constant to State
* inputs:
* a0-a7 = state
* constant = constant to add
* t0 = clobber
* outputs:
* a0-a7 = state + constant
* */
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\
}/**/
/* VPERM
* Set Substitute Core Constants
* */
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
}/**/
/* VPERM
* Substitute Core
* first part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of the hash function Groestl
* by Cagdas CALIK
* inputs:
* a0 = 1 row
* t*, c* = clobbers
* outputs:
* b0a, b0b = inputs for lookup step
* */
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
asm ("psrld xmm"tostr(t0)", 4");\
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\
asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\
asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\
asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\
asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\
}/**/
/* VPERM
* Lookup
* second part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of the hash function Groestl
* by Cagdas CALIK
* inputs:
* a0a, a0b = output of Substitution Core
* table = lookup table to use (*1 / *2 / *4)
* t0 = clobber
* outputs:
* b0 = output of sbox + multiplication
* */
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\
asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\
asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\
asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\
}/**/
/* VPERM
* SubBytes and *2 / *4
* this function is derived from:
* Constant-time SSSE3 AES core implementation
* by Mike Hamburg
* and
* vperm and aes_ni implementations of the hash function Groestl
* by Cagdas CALIK
* inputs:
* a0-a7 = state
* t*, c* = clobbers
* outputs:
* a0-a7 = state * 4
* c2 = row0 * 2 -> b0
* c1 = row7 * 2 -> b3
* c0 = row7 * 1 -> b4
* t2 = row4 * 1 -> b7
* TEMP_MUL1 = row(i) * 1
* TEMP_MUL2 = row(i) * 2
*
* call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
/* set Constants */\
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
/* row 1 */\
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
/* --- */\
/* row 2 */\
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
/* --- */\
/* row 3 */\
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
/* --- */\
/* row 5 */\
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
/* --- */\
/* row 6 */\
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
/* --- */\
/* row 7 */\
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
/* --- */\
/* row 4 */\
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
/* --- */\
/* row 0 */\
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
/* --- */\
}/**/
/* Optimized MixBytes
* inputs:
* a0-a7 = (row0-row7) * 4
* b0 = row0 * 2
* b3 = row7 * 2
* b4 = row7 * 1
* b7 = row4 * 1
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
* output: b0-b7
* */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* save one value */\
asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\
/* 1 */\
asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\
asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\
asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\
\
/* 2 */\
asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\
asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\
asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\
\
/* 4 */\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\
/*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\
asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\
asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\
asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\
\
/* 3 */\
asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\
/*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\
asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\
\
/* 5 */\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
/*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\
asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\
asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\
asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\
\
/* 6 */\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
\
/* 7 */\
asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\
asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\
asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
\
/* 8 */\
asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
\
/* 9 */\
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
\
/* 10 */\
asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\
\
/* 11 */\
asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
\
/* 12 */\
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\
asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
\
/* 13 */\
asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\
asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
}/**/
//#if (LENGTH <= 256)
#define SET_CONSTANTS(){\
SET_SHARED_CONSTANTS();\
((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\
((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\
((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\
((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\
((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\
((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\
((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\
((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\
((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\
((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\
((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\
((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\
((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\
((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\
((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\
((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\
for(i = 0; i < ROUNDS512; i++)\
{\
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
}\
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
}/**/
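/* Note (summary, not part of the diff): with P and Q interleaved, each
 * constant qword pair packs P's value in the low half and Q's in the high
 * half.  P only xors a constant into row 0 (the 0x7060...00 ^ i pattern),
 * while Q xors 0xff into every byte, hence the 0xffffffffffffffff halves
 * of ROUND_CONST_L0 and ROUND_CONST_Lx, plus Q's own row-7 constant in
 * ROUND_CONST_L7. */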
#define Push_All_Regs(){\
/* not using any...
asm("push rax");\
asm("push rbx");\
asm("push rcx");*/\
}/**/
#define Pop_All_Regs(){\
/* not using any...
asm("pop rcx");\
asm("pop rbx");\
asm("pop rax");*/\
}/**/
/* vperm:
* transformation before rounds with ipt
* first round add transformed constant
* middle rounds: add constant XOR 0x15...15
* last round: additionally add 0x15...15 after MB
* transformation after rounds with opt
*/
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant + ShiftBytes (interleaved) */\
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
/* SubBytes + Multiplication by 2 and 4 */\
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}/**/
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
\
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
\
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
\
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
\
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst_CNT2(i, j){\
asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\
asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\
asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\
asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\
VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
asm ("pxor xmm0, [ALL_15]");\
asm ("pxor xmm1, [ALL_15]");\
asm ("pxor xmm2, [ALL_15]");\
asm ("pxor xmm3, [ALL_15]");\
asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\
asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\
asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\
asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst(){\
asm ("movaps xmm0, [ROUND_CONST_Lx]");\
VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
asm ("pxor xmm0, [ALL_15]");\
asm ("movaps [ROUND_CONST_Lx], xmm0");\
VPERM_Transform_RoundConst_CNT2(0, 1);\
VPERM_Transform_RoundConst_CNT2(2, 3);\
VPERM_Transform_RoundConst_CNT2(4, 5);\
VPERM_Transform_RoundConst_CNT2(6, 7);\
VPERM_Transform_RoundConst_CNT2(8, 9);\
}/**/
void INIT256(u64* h)
{
/* System V AMD64 calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
asm volatile ("emms");
/* transform round constants into VPERM mode */
VPERM_Transform_RoundConst();
/* load IV into registers xmm12 - xmm15 */
asm ("movaps xmm12, [rdi+0*16]");
asm ("movaps xmm13, [rdi+1*16]");
asm ("movaps xmm14, [rdi+2*16]");
asm ("movaps xmm15, [rdi+3*16]");
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* store transposed IV */
asm ("movaps [rdi+0*16], xmm12");
asm ("movaps [rdi+1*16], xmm2");
asm ("movaps [rdi+2*16], xmm6");
asm ("movaps [rdi+3*16], xmm7");
asm volatile ("emms");
asm (".att_syntax noprefix");
}
void TF512(u64* h, u64* m)
{
/* System V AMD64 calling convention: */
/* chaining value CV in rdi */
/* message M in rsi */
#ifdef IACA_TRACE
IACA_START;
#endif
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load message into registers xmm12 - xmm15 (Q = message) */
asm ("movaps xmm12, [rsi+0*16]");
asm ("movaps xmm13, [rsi+1*16]");
asm ("movaps xmm14, [rsi+2*16]");
asm ("movaps xmm15, [rsi+3*16]");
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm0, [rdi+1*16]");
asm ("movaps xmm4, [rdi+2*16]");
asm ("movaps xmm5, [rdi+3*16]");
/* xor message into CV to get the input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
asm ("pxor xmm8, xmm12");
asm ("pxor xmm0, xmm2");
asm ("pxor xmm4, xmm6");
asm ("pxor xmm5, xmm7");
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
asm ("pxor xmm0, xmm8");
asm ("pxor xmm1, xmm10");
asm ("pxor xmm2, xmm12");
asm ("pxor xmm3, xmm14");
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
asm ("pxor xmm0, [rdi+0*16]");
asm ("pxor xmm1, [rdi+1*16]");
asm ("pxor xmm2, [rdi+2*16]");
asm ("pxor xmm3, [rdi+3*16]");
/* store CV */
asm ("movaps [rdi+0*16], xmm0");
asm ("movaps [rdi+1*16], xmm1");
asm ("movaps [rdi+2*16], xmm2");
asm ("movaps [rdi+3*16], xmm3");
Pop_All_Regs();
asm (".att_syntax noprefix");
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
/* System V AMD64 calling convention: */
/* chaining value CV in rdi */
asm (".intel_syntax noprefix");
Push_All_Regs();
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
asm ("movaps xmm8, [rdi+0*16]");
asm ("movaps xmm10, [rdi+1*16]");
asm ("movaps xmm12, [rdi+2*16]");
asm ("movaps xmm14, [rdi+3*16]");
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
asm ("pxor xmm8, [rdi+0*16]");
asm ("pxor xmm10, [rdi+1*16]");
asm ("pxor xmm12, [rdi+2*16]");
asm ("pxor xmm14, [rdi+3*16]");
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7);
/* we only need to return the truncated half of the state */
asm ("movaps [rdi+2*16], xmm9");
asm ("movaps [rdi+3*16], xmm11");
Pop_All_Regs();
asm (".att_syntax noprefix");
return;
}

View File

@@ -11,6 +11,18 @@
#include <wmmintrin.h>
#include "hash-groestl256.h"
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
//__m128i ROUND_CONST_P[ROUNDS1024];
//__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
@@ -101,7 +113,7 @@
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = ALL_1B;\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
@@ -141,35 +153,24 @@
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
static const uint64_t round_const_l0[] __attribute__ ((aligned (64))) =
{
0x7060504030201000, 0xffffffffffffffff,
0x7161514131211101, 0xffffffffffffffff,
0x7262524232221202, 0xffffffffffffffff,
0x7363534333231303, 0xffffffffffffffff,
0x7464544434241404, 0xffffffffffffffff,
0x7565554535251505, 0xffffffffffffffff,
0x7666564636261606, 0xffffffffffffffff,
0x7767574737271707, 0xffffffffffffffff,
0x7868584838281808, 0xffffffffffffffff,
0x7969594939291909, 0xffffffffffffffff
};
static const uint64_t round_const_l7[] __attribute__ ((aligned (64))) =
{
0x0000000000000000, 0x8f9fafbfcfdfefff,
0x0000000000000000, 0x8e9eaebecedeeefe,
0x0000000000000000, 0x8d9dadbdcdddedfd,
0x0000000000000000, 0x8c9cacbcccdcecfc,
0x0000000000000000, 0x8b9babbbcbdbebfb,
0x0000000000000000, 0x8a9aaabacadaeafa,
0x0000000000000000, 0x8999a9b9c9d9e9f9,
0x0000000000000000, 0x8898a8b8c8d8e8f8,
0x0000000000000000, 0x8797a7b7c7d7e7f7,
0x0000000000000000, 0x8696a6b6c6d6e6f6
};
#define SET_CONSTANTS(){\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0); \
/* one round
* i = round number
@@ -178,42 +179,34 @@ static const uint64_t round_const_l7[] __attribute__ ((aligned (64))) =
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = m128_const_64( 0xffffffffffffffff, 0 ); \
a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
a1 = _mm_xor_si128( a1, b1 ); \
a2 = _mm_xor_si128( a2, b1 ); \
a3 = _mm_xor_si128( a3, b1 ); \
a4 = _mm_xor_si128( a4, b1 ); \
a5 = _mm_xor_si128( a5, b1 ); \
a6 = _mm_xor_si128( a6, b1 ); \
a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8( a0, m128_const_64( 0x03060a0d08020509, \
0x0c0f0104070b0e00 ) ); \
a0 = _mm_aesenclast_si128( a0, b0 );\
a1 = _mm_shuffle_epi8( a1, m128_const_64( 0x04070c0f0a03060b, \
0x0e090205000d0801 ) ); \
a1 = _mm_aesenclast_si128( a1, b0 );\
a2 = _mm_shuffle_epi8( a2, m128_const_64( 0x05000e090c04070d, \
0x080b0306010f0a02 ) ); \
a2 = _mm_aesenclast_si128( a2, b0 );\
a3 = _mm_shuffle_epi8( a3, m128_const_64( 0x0601080b0e05000f, \
0x0a0d040702090c03 ) ); \
a3 = _mm_aesenclast_si128( a3, b0 );\
a4 = _mm_shuffle_epi8( a4, m128_const_64( 0x0702090c0f060108, \
0x0b0e0500030a0d04 ) ); \
a4 = _mm_aesenclast_si128( a4, b0 );\
a5 = _mm_shuffle_epi8( a5, m128_const_64( 0x00030b0e0907020a, \
0x0d080601040c0f05 ) ); \
a5 = _mm_aesenclast_si128( a5, b0 );\
a6 = _mm_shuffle_epi8( a6, m128_const_64( 0x01040d080b00030c, \
0x0f0a0702050e0906 ) ); \
a6 = _mm_aesenclast_si128( a6, b0 );\
a7 = _mm_shuffle_epi8( a7, m128_const_64( 0x02050f0a0d01040e, \
0x090c000306080b07 ) ); \
a7 = _mm_aesenclast_si128( a7, b0 );\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
@@ -242,7 +235,7 @@ static const uint64_t round_const_l7[] __attribute__ ((aligned (64))) =
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 ); \
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\

View File

@@ -0,0 +1,482 @@
/* groestl-intr-avx.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx
* instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include <smmintrin.h>
#include <wmmintrin.h>
#include <immintrin.h>
#include "hash-groestl256.h"
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_FF;
//#if LENGTH <= 256
__m128i ALL_1B;
//#else
//__m256d ALL_1B;
//#endif
#define tos(a) #a
#define tostr(a) tos(a)
#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos)))
#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos))
#define SET_CONSTANTS(){\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0);
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b
* xmm[z] has to be zero */
#define VMUL2(i, j, k, z){\
j = _mm_cmpgt_epi8(z, i);\
i = _mm_add_epi8(i, i);\
j = _mm_and_si128(j, k);\
i = _mm_xor_si128(i, j);\
}/**/
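/* Reference sketch (illustration, not part of the diff): VMUL2 is the
 * byte-sliced form of the AES-field doubling ("xtime") over GF(2^8) with
 * reduction polynomial 0x11b, applied to all 16 bytes at once.  A plain-C
 * single-byte equivalent (assumes <stdint.h>):
 */
static inline uint8_t xtime(uint8_t x)
{
  /* shift left one bit; if the top bit was set, reduce by xoring 0x1b */
  return (uint8_t)((x << 1) ^ ((x & 0x80) ? 0x1b : 0x00));
}
/* _mm_cmpgt_epi8(zero, i) yields 0xff exactly in the bytes whose top bit
 * is set (negative as signed int8), selecting where 0x1b is xored after
 * the byte-wise add i+i. */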
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7 c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
b0 = a2;\
b1 = a3;\
b2 = a4;\
b3 = a5;\
b4 = a6;\
b5 = a7;\
b6 = a0;\
b7 = a1;\
\
/* t_i = a_i + a_{i+1} */\
a0 = _mm_xor_si128(a0, a1);\
a1 = _mm_xor_si128(a1, a2);\
a2 = _mm_xor_si128(a2, a3);\
a3 = _mm_xor_si128(a3, a4);\
a4 = _mm_xor_si128(a4, a5);\
a5 = _mm_xor_si128(a5, a6);\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y_{i+4} in regs xmm8..xmm15 by adding t_i */\
b0 = _mm_xor_si128(b0, a4);\
b1 = _mm_xor_si128(b1, a5);\
b2 = _mm_xor_si128(b2, a6);\
b3 = _mm_xor_si128(b3, a7);\
b4 = _mm_xor_si128(b4, a0);\
b5 = _mm_xor_si128(b5, a1);\
b6 = _mm_xor_si128(b6, a2);\
b7 = _mm_xor_si128(b7, a3);\
\
b0 = _mm_xor_si128(b0, a6);\
b1 = _mm_xor_si128(b1, a7);\
b2 = _mm_xor_si128(b2, a0);\
b3 = _mm_xor_si128(b3, a1);\
b4 = _mm_xor_si128(b4, a2);\
b5 = _mm_xor_si128(b5, a3);\
b6 = _mm_xor_si128(b6, a4);\
b7 = _mm_xor_si128(b7, a5);\
\
/* spill values y_4, y_5, y_6 to memory */\
TEMP0 = b0;\
TEMP1 = b1;\
TEMP2 = b2;\
\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b1 = a1;\
TEMP3 = a2;\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(a2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP3);\
\
/* compute z_i : double x_i using scratch b0, all-0x1b in b1 and zero in b2 */\
b1 = ALL_1B;\
b2 = _mm_xor_si128(b2, b2);\
VMUL2(a7, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a0, b0, b1, b2);\
\
/* compute w_i : add y_{i+4} */\
a0 = _mm_xor_si128(a0, TEMP0);\
a1 = _mm_xor_si128(a1, TEMP1);\
a2 = _mm_xor_si128(a2, TEMP2);\
a3 = _mm_xor_si128(a3, b3);\
a4 = _mm_xor_si128(a4, b4);\
a5 = _mm_xor_si128(a5, b5);\
a6 = _mm_xor_si128(a6, b6);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
VMUL2(a0, b0, b1, b2);\
VMUL2(a1, b0, b1, b2);\
VMUL2(a2, b0, b1, b2);\
VMUL2(a3, b0, b1, b2);\
VMUL2(a4, b0, b1, b2);\
VMUL2(a5, b0, b1, b2);\
VMUL2(a6, b0, b1, b2);\
VMUL2(a7, b0, b1, b2);\
\
/* compute b_i = v_{i+3} + y_{i+4} */\
b0 = _mm_xor_si128(a3, TEMP0);\
b1 = _mm_xor_si128(a4, TEMP1);\
b2 = _mm_xor_si128(a5, TEMP2);\
b3 = _mm_xor_si128(b3, a6);\
b4 = _mm_xor_si128(b4, a7);\
b5 = _mm_xor_si128(b5, a0);\
b6 = _mm_xor_si128(b6, a1);\
b7 = _mm_xor_si128(b7, a2);\
}/*MixBytes*/
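/* Reference sketch (illustration, not part of the diff): the formulae
 * above written as plain C over a single 8-byte column, using xtime()
 * from the note after VMUL2; indices are taken mod 8.  It documents what
 * the register schedule computes, not how to compute it fast. */
static inline void mixbytes_column_ref(const uint8_t a[8], uint8_t b[8])
{
  uint8_t t[8], x[8], y[8], z[8], w[8], v[8];
  int i;
  for (i = 0; i < 8; i++) t[i] = a[i] ^ a[(i+1) & 7];
  for (i = 0; i < 8; i++) x[i] = t[i] ^ t[(i+3) & 7];
  for (i = 0; i < 8; i++) y[i] = t[i] ^ t[(i+2) & 7] ^ a[(i+6) & 7];
  for (i = 0; i < 8; i++) z[i] = xtime(x[i]);
  for (i = 0; i < 8; i++) w[i] = z[i] ^ y[(i+4) & 7];
  for (i = 0; i < 8; i++) v[i] = xtime(w[i]);
  for (i = 0; i < 8; i++) b[i] = v[(i+3) & 7] ^ y[(i+4) & 7];
}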
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* Add Round Constant */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = _mm_unpackhi_epi16(i0, i1);\
i0 = _mm_unpacklo_epi16(i0, i1);\
t0 = _mm_unpackhi_epi16(i2, i3);\
i2 = _mm_unpacklo_epi16(i2, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = _mm_unpackhi_epi32(i0, i2);\
o3 = _mm_unpackhi_epi32(o1, t0);\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
}/**/
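/* Note (illustration, not part of the diff): with 16-bit words
 * A = a0..a7 and B = b0..b7, _mm_unpacklo_epi16(A, B) yields
 * a0 b0 a1 b1 a2 b2 a3 b3 and _mm_unpackhi_epi16(A, B) yields
 * a4 b4 a5 b5 a6 b6 a7 b7; two such passes plus _mm_shuffle_epi32
 * with 216 (0xd8: dword order 0,2,1,3) assemble the rows from the
 * column-ordered input. */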
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = _mm_unpackhi_epi64(i0, i4);\
i0 = _mm_unpacklo_epi64(i0, i4);\
o2 = _mm_unpacklo_epi64(i1, i5);\
o3 = _mm_unpackhi_epi64(i1, i5);\
o4 = _mm_unpacklo_epi64(i2, i6);\
o5 = _mm_unpackhi_epi64(i2, i6);\
o6 = _mm_unpacklo_epi64(i3, i7);\
o7 = _mm_unpackhi_epi64(i3, i7);\
}/**/
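/* Note (illustration, not part of the diff): _mm_unpacklo_epi64(a, b)
 * returns (low qword of a, low qword of b) and _mm_unpackhi_epi64 the
 * two high qwords, so each output register pairs one 64-bit row of P
 * with the matching row of Q. */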
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = _mm_unpackhi_epi64(i0, i1);\
i0 = _mm_unpacklo_epi64(i0, i1);\
o1 = _mm_unpackhi_epi64(i2, i3);\
i2 = _mm_unpacklo_epi64(i2, i3);\
o2 = _mm_unpackhi_epi64(i4, i5);\
i4 = _mm_unpacklo_epi64(i4, i5);\
o3 = _mm_unpackhi_epi64(i6, i7);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = _mm_unpackhi_epi64(i0, t0);\
i0 = _mm_unpacklo_epi64(i0, t0);\
i3 = _mm_unpackhi_epi64(i2, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i5 = _mm_unpackhi_epi64(i4, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i7 = _mm_unpackhi_epi64(i6, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
void INIT256(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
static __m128i TEMP3;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value and xor message to CV to get input of P */
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm12, chaining[0]);
xmm0 = _mm_xor_si128(xmm2, chaining[1]);
xmm4 = _mm_xor_si128(xmm6, chaining[2]);
xmm5 = _mm_xor_si128(xmm7, chaining[3]);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, chaining[0]);
xmm1 = _mm_xor_si128(xmm1, chaining[1]);
xmm2 = _mm_xor_si128(xmm2, chaining[2]);
xmm3 = _mm_xor_si128(xmm3, chaining[3]);
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
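/* Note (summary, not part of the diff): TF512 computes the Groestl-256
 * compression function
 *    h' = P(h ^ m) ^ Q(m) ^ h
 * with P and Q evaluated in parallel on the row-interleaved state. */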
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
static __m128i TEMP3;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
}
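/* Note (summary, not part of the diff): OF512 is the output
 * transformation  omega(h) = trunc_256( P(h) ^ h ), which is why only
 * the last two 128-bit words of the state are written back. */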

View File

@@ -0,0 +1,793 @@
/* groestl-intr-vperm.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3 instructions.
* Author: Günther A. Roland, Martin Schläffer
*
* Based on the vperm and aes_ni implementations of the hash function Groestl
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
* Institute of Applied Mathematics, Middle East Technical University, Turkey
*
* This code is placed in the public domain
*/
#include <tmmintrin.h>
#include "hash-groestl256.h"
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_0F;
__m128i ALL_15;
__m128i ALL_1B;
__m128i ALL_63;
__m128i ALL_FF;
__m128i VPERM_IPT[2];
__m128i VPERM_OPT[2];
__m128i VPERM_INV[2];
__m128i VPERM_SB1[2];
__m128i VPERM_SB2[2];
__m128i VPERM_SB4[2];
__m128i VPERM_SBO[2];
#define tos(a) #a
#define tostr(a) tos(a)
#define SET_SHARED_CONSTANTS(){\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\
ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\
VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\
VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\
VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\
VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\
VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\
VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\
VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\
VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\
VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\
VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\
VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\
VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\
}/**/
/* VPERM
* Transform w/o settings c*
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of the hash function Groestl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
t0 = c0;\
t1 = c0;\
t0 = _mm_andnot_si128(t0, a0);\
t1 = _mm_andnot_si128(t1, a1);\
t0 = _mm_srli_epi32(t0, 4);\
t1 = _mm_srli_epi32(t1, 4);\
a0 = _mm_and_si128(a0, c0);\
a1 = _mm_and_si128(a1, c0);\
t2 = c2;\
t3 = c2;\
t2 = _mm_shuffle_epi8(t2, a0);\
t3 = _mm_shuffle_epi8(t3, a1);\
a0 = c1;\
a1 = c1;\
a0 = _mm_shuffle_epi8(a0, t0);\
a1 = _mm_shuffle_epi8(a1, t1);\
a0 = _mm_xor_si128(a0, t2);\
a1 = _mm_xor_si128(a1, t3);\
}/**/
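/* Note (illustration, not part of the diff): per byte x this computes
 *    out = c2[ x & 0x0f ] ^ c1[ (x >> 4) & 0x0f ]
 * i.e. two 16-entry table lookups (pshufb) on the low and high nibble,
 * xored together; c0 = ALL_0F is the mask that splits the nibbles. */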
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
c0 = ALL_0F;\
c1 = ((__m128i*) table )[0];\
c2 = ((__m128i*) table )[1];\
}/**/
/* VPERM
* Transform
* transforms 2 rows to/from "vperm mode"
* this function is derived from:
* vperm and aes_ni implementations of the hash function Groestl
* by Cagdas CALIK
* inputs:
* a0, a1 = 2 rows
* table = transformation table to use
* t*, c* = clobbers
* outputs:
* a0, a1 = 2 rows transformed with table
* */
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Transform State
* inputs:
* a0-a3 = state
* table = transformation table to use
* t* = clobbers
* outputs:
* a0-a3 = transformed state
* */
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
VPERM_Transform_Set_Const(table, c0, c1, c2);\
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
}/**/
/* VPERM
* Add Constant to State
* inputs:
* a0-a7 = state
* constant = constant to add
* t0 = clobber
* outputs:
* a0-a7 = state + constant
* */
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
t0 = constant;\
a0 = _mm_xor_si128(a0, t0);\
a1 = _mm_xor_si128(a1, t0);\
a2 = _mm_xor_si128(a2, t0);\
a3 = _mm_xor_si128(a3, t0);\
a4 = _mm_xor_si128(a4, t0);\
a5 = _mm_xor_si128(a5, t0);\
a6 = _mm_xor_si128(a6, t0);\
a7 = _mm_xor_si128(a7, t0);\
}/**/
/* VPERM
* Set Substitute Core Constants
* */
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
}/**/
/* VPERM
* Substitute Core
* first part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of the hash function Groestl
* by Cagdas CALIK
* inputs:
* a0 = 1 row
* t*, c* = clobbers
* outputs:
* b0a, b0b = inputs for lookup step
* */
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
t0 = c0;\
t0 = _mm_andnot_si128(t0, a0);\
t0 = _mm_srli_epi32(t0, 4);\
a0 = _mm_and_si128(a0, c0);\
b0a = c1;\
b0a = _mm_shuffle_epi8(b0a, a0);\
a0 = _mm_xor_si128(a0, t0);\
b0b = c2;\
b0b = _mm_shuffle_epi8(b0b, t0);\
b0b = _mm_xor_si128(b0b, b0a);\
t1 = c2;\
t1 = _mm_shuffle_epi8(t1, a0);\
t1 = _mm_xor_si128(t1, b0a);\
b0a = c2;\
b0a = _mm_shuffle_epi8(b0a, b0b);\
b0a = _mm_xor_si128(b0a, a0);\
b0b = c2;\
b0b = _mm_shuffle_epi8(b0b, t1);\
b0b = _mm_xor_si128(b0b, t0);\
}/**/
/* VPERM
* Lookup
* second part of sbox inverse computation
* this function is derived from:
* vperm and aes_ni implementations of the hash function Groestl
* by Cagdas CALIK
* inputs:
* a0a, a0b = output of Substitution Core
* table = lookup table to use (*1 / *2 / *4)
* t0 = clobber
* outputs:
* b0 = output of sbox + multiplication
* */
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
b0 = ((__m128i*) table )[0];\
t0 = ((__m128i*) table )[1];\
b0 = _mm_shuffle_epi8(b0, a0b);\
t0 = _mm_shuffle_epi8(t0, a0a);\
b0 = _mm_xor_si128(b0, t0);\
}/**/
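/* Note (summary, not part of the diff): following Hamburg's constant-time
 * SSSE3 AES technique, Substitute_Core reduces the sbox computation to two
 * 4-bit indices and Lookup combines them through the *1 / *2 / *4 tables,
 * so SubBytes and the MixBytes field multiplications fall out of the same
 * pshufb passes. */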
/* VPERM
* SubBytes and *2 / *4
* this function is derived from:
* Constant-time SSSE3 AES core implementation
* by Mike Hamburg
* and
* vperm and aes_ni implementations of the hash function Groestl
* by Cagdas CALIK
* inputs:
* a0-a7 = state
* t*, c* = clobbers
* outputs:
* a0-a7 = state * 4
* c2 = row0 * 2 -> b0
* c1 = row7 * 2 -> b3
* c0 = row7 * 1 -> b4
* t2 = row4 * 1 -> b7
* TEMP_MUL1 = row(i) * 1
* TEMP_MUL2 = row(i) * 2
*
* call: VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
/* set Constants */\
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
/* row 1 */\
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[1] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[1] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
/* --- */\
/* row 2 */\
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[2] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[2] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
/* --- */\
/* row 3 */\
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[3] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[3] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
/* --- */\
/* row 5 */\
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[5] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[5] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
/* --- */\
/* row 6 */\
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[6] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[6] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
/* --- */\
/* row 7 */\
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
TEMP_MUL1[7] = t2;\
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
/* --- */\
/* row 4 */\
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
TEMP_MUL2[4] = t3;\
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
/* --- */\
/* row 0 */\
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
TEMP_MUL2[0] = c2;\
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
/* --- */\
}/**/
/* Optimized MixBytes
* inputs:
* a0-a7 = (row0-row7) * 4
* b0 = row0 * 2
* b3 = row7 * 2
* b4 = row7 * 1
* b7 = row4 * 1
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
* output: b0-b7
* */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* save one value */\
TEMP_MUL4 = a3;\
/* 1 */\
b1 = a0;\
b1 = _mm_xor_si128(b1, a5);\
b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
b2 = b1;\
\
/* 2 */\
b5 = a1;\
b5 = _mm_xor_si128(b5, a4);\
b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
b6 = b5;\
\
/* 4 */\
b7 = _mm_xor_si128(b7, a6);\
/*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
b2 = _mm_xor_si128(b2, b7);\
\
/* 3 */\
b0 = _mm_xor_si128(b0, a7);\
b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
/*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
b3 = b0;\
b1 = _mm_xor_si128(b1, b0);\
b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
\
/* 5 */\
b4 = _mm_xor_si128(b4, a2);\
/*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
b3 = _mm_xor_si128(b3, b4);\
b6 = _mm_xor_si128(b6, b4);\
\
/* 6 */\
a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
b4 = _mm_xor_si128(b4, a3);\
b5 = _mm_xor_si128(b5, a3);\
b7 = _mm_xor_si128(b7, a3);\
\
/* 7 */\
a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
b2 = _mm_xor_si128(b2, a1);\
b3 = _mm_xor_si128(b3, a1);\
\
/* 8 */\
a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
b6 = _mm_xor_si128(b6, a5);\
b7 = _mm_xor_si128(b7, a5);\
\
/* 9 */\
a3 = TEMP_MUL1[2];\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
b0 = _mm_xor_si128(b0, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* 10 */\
a1 = TEMP_MUL1[6];\
a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
b1 = _mm_xor_si128(b1, a1);\
b4 = _mm_xor_si128(b4, a1);\
\
/* 11 */\
a5 = TEMP_MUL1[3];\
a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
b1 = _mm_xor_si128(b1, a5);\
b6 = _mm_xor_si128(b6, a5);\
\
/* 12 */\
a3 = TEMP_MUL1[7];\
a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
b2 = _mm_xor_si128(b2, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* 13 */\
b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
b0 = _mm_xor_si128(b0, a4);\
b1 = _mm_xor_si128(b1, a4);\
b3 = _mm_xor_si128(b3, a6);\
b4 = _mm_xor_si128(b4, a0);\
b4 = _mm_xor_si128(b4, a7);\
b5 = _mm_xor_si128(b5, a0);\
b7 = _mm_xor_si128(b7, a2);\
}/**/
#define SET_CONSTANTS(){\
SET_SHARED_CONSTANTS();\
SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\
SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\
SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\
SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\
SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\
SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\
SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\
SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}/**/
/* vperm:
* transformation before rounds with ipt
* first round add transformed constant
* middle rounds: add constant XOR 0x15...15
* last round: additionally add 0x15...15 after MB
* transformation after rounds with opt
*/
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant + ShiftBytes (interleaved) */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a4 = _mm_xor_si128(a4, b1);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
/* SubBytes + Multiplication by 2 and 4 */\
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}/**/
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm_unpacklo_epi16(i0, i1);\
o1 = _mm_unpackhi_epi16(o1, i1);\
i2 = _mm_unpacklo_epi16(i2, i3);\
t0 = _mm_unpackhi_epi16(t0, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
o2 = _mm_unpackhi_epi32(o2, i2);\
o3 = _mm_unpackhi_epi32(o3, t0);\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm_unpacklo_epi64(i0, i4);\
o1 = _mm_unpackhi_epi64(o1, i4);\
o3 = i1;\
o4 = i2;\
o2 = _mm_unpacklo_epi64(o2, i5);\
o3 = _mm_unpackhi_epi64(o3, i5);\
o5 = i2;\
o6 = i3;\
o4 = _mm_unpacklo_epi64(o4, i6);\
o5 = _mm_unpackhi_epi64(o5, i6);\
o7 = i3;\
o6 = _mm_unpacklo_epi64(o6, i7);\
o7 = _mm_unpackhi_epi64(o7, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o0 = _mm_unpackhi_epi64(o0, i1);\
o1 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
o1 = _mm_unpackhi_epi64(o1, i3);\
o2 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
o2 = _mm_unpackhi_epi64(o2, i5);\
o3 = i6;\
i6 = _mm_unpacklo_epi64(i6, i7);\
o3 = _mm_unpackhi_epi64(o3, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm_unpacklo_epi64(i0, t0);\
i1 = _mm_unpackhi_epi64(i1, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i3 = _mm_unpackhi_epi64(i3, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i5 = _mm_unpackhi_epi64(i5, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
i7 = _mm_unpackhi_epi64(i7, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst_CNT2(i, j){\
xmm0 = ROUND_CONST_L0[i];\
xmm1 = ROUND_CONST_L7[i];\
xmm2 = ROUND_CONST_L0[j];\
xmm3 = ROUND_CONST_L7[j];\
VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
xmm1 = _mm_xor_si128(xmm1, (ALL_15));\
xmm2 = _mm_xor_si128(xmm2, (ALL_15));\
xmm3 = _mm_xor_si128(xmm3, (ALL_15));\
ROUND_CONST_L0[i] = xmm0;\
ROUND_CONST_L7[i] = xmm1;\
ROUND_CONST_L0[j] = xmm2;\
ROUND_CONST_L7[j] = xmm3;\
}/**/
/* transform round constants into VPERM mode */
#define VPERM_Transform_RoundConst(){\
xmm0 = ROUND_CONST_Lx;\
VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
ROUND_CONST_Lx = xmm0;\
VPERM_Transform_RoundConst_CNT2(0, 1);\
VPERM_Transform_RoundConst_CNT2(2, 3);\
VPERM_Transform_RoundConst_CNT2(4, 5);\
VPERM_Transform_RoundConst_CNT2(6, 7);\
VPERM_Transform_RoundConst_CNT2(8, 9);\
}/**/
void INIT256(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* transform round constants into VPERM mode */
VPERM_Transform_RoundConst();
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP_MUL1[8];
static __m128i TEMP_MUL2[8];
static __m128i TEMP_MUL4;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
xmm8 = chaining[0];
xmm0 = chaining[1];
xmm4 = chaining[2];
xmm5 = chaining[3];
/* xor message into CV to get the input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm8, xmm12);
xmm0 = _mm_xor_si128(xmm0, xmm2);
xmm4 = _mm_xor_si128(xmm4, xmm6);
xmm5 = _mm_xor_si128(xmm5, xmm7);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
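/* For reference, TF512 implements the Groestl compression function
 *   f(h, m) = P(h ^ m) ^ Q(m) ^ h
 * on the 512-bit state, exactly as the result comments above describe.
 * A minimal scalar sketch follows; perm_p/perm_q are hypothetical
 * caller-supplied stand-ins for the ROUNDS_P_Q work, not functions
 * from this file. */
#include <stdint.h>
#include <string.h>

typedef void (*groestl_perm_fn)( uint64_t state[8] );

static void tf512_model( uint64_t h[8], const uint64_t m[8],
                         groestl_perm_fn perm_p, groestl_perm_fn perm_q )
{
   uint64_t p[8], q[8];
   int i;
   for ( i = 0; i < 8; i++ ) p[i] = h[i] ^ m[i];   /* input of P is CV ^ M */
   memcpy( q, m, sizeof q );                       /* input of Q is M */
   perm_p( p );
   perm_q( q );
   for ( i = 0; i < 8; i++ ) h[i] ^= p[i] ^ q[i];  /* P(CV^M) ^ Q(M) ^ CV */
}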
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP_MUL1[8];
static __m128i TEMP_MUL2[8];
static __m128i TEMP_MUL4;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
return;
}//OF512()

View File

@@ -16,13 +16,48 @@
#ifdef __AES__
#include "groestl-intr-aes.h"
#include "groestl-version.h"
#ifdef TASM
#ifdef VAES
#include "groestl-asm-aes.h"
#else
#ifdef VAVX
#include "groestl-asm-avx.h"
#else
#ifdef VVPERM
#include "groestl-asm-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VPERM])
#endif
#endif
#endif
#else
#ifdef TINTR
#ifdef VAES
#include "groestl-intr-aes.h"
#else
#ifdef VAVX
#include "groestl-intr-avx.h"
#else
#ifdef VVPERM
#include "groestl-intr-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VPERM])
#endif
#endif
#endif
#else
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
#endif
#endif
HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
{
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
@@ -35,6 +70,8 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
// INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -55,6 +92,8 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
// INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -70,7 +109,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
// 5. Midstate provides less benefit than a full hash if the total hash
// (midstate + tail) is less than 1 block.
// This, unfortunately, is the case with all current users.
// 6. the more full blocks the bigger the gain
// use only for midstate precalc
HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
@@ -104,11 +143,12 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
// deprecated do not use
HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
{
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const uint64_t blocks = ctx->blk_count + 1; // adjust for final block
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const int blocks = ctx->blk_count + 1; // adjust for final block
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
int i;
// first pad byte = 0x80, last pad byte = block count
@@ -117,18 +157,21 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
if ( rem_ptr == len - 1 )
{
// only 128 bits left in buffer, all padding at once
ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 );
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 );
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
@@ -142,75 +185,6 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
return SUCCESS_GR;
}
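/* Plain-C sketch of the final 128-byte pad block assembled above by the
 * multi-vector branch (standalone illustration; needs <stdint.h>;
 * assumes rem <= 118 so the count bytes stay clear, and writes the full
 * big-endian 64-bit block count where the SIMD code only needs its low
 * two bytes): */
static void groestl_pad_model( unsigned char blk[128], int rem,
                               uint64_t blocks )
{
   int i;
   blk[rem] = 0x80;                              /* first pad byte */
   for ( i = rem + 1; i < 120; i++ ) blk[i] = 0; /* zero fill */
   for ( i = 0; i < 8; i++ )                     /* big-endian block count */
      blk[ 120 + i ] = (unsigned char)( blocks >> ( 8 * ( 7 - i ) ) );
}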
int groestl512_full( hashState_groestl* ctx, void* output,
const void* input, uint64_t databitlen )
{
int i;
ctx->hashlen = 64;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
// copy any remaining data to buffer; it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
//--- final ---
blocks++; // adjust for final block
if ( i == len - 1 )
{
// only 128 bits left in buffer, all padding at once
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
}
else
{
// add first padding
ctx->buffer[i] = m128_const_64( 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
}
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
const void* input, DataLength_gr databitlen )
{
@@ -218,7 +192,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
uint64_t blocks = len / SIZE512;
int blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
int i;
@@ -242,22 +216,26 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
if ( i == len - 1 )
{
// only 128 bits left in buffer, all padding at once
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[i] = m128_const_64( 0, 0x80 );
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output

View File

@@ -87,6 +87,5 @@ HashReturn_gr final_groestl( hashState_groestl*, void* );
HashReturn_gr update_and_final_groestl( hashState_groestl*, void*,
const void*, DataLength_gr );
int groestl512_full( hashState_groestl*, void*, const void*, uint64_t );
#endif /* __hash_h */

View File

@@ -13,7 +13,41 @@
#ifdef __AES__
#include "groestl256-intr-aes.h"
#include "groestl-version.h"
#ifdef TASM
#ifdef VAES
#include "groestl256-asm-aes.h"
#else
#ifdef VAVX
#include "groestl256-asm-avx.h"
#else
#ifdef VVPERM
#include "groestl256-asm-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VPERM])
#endif
#endif
#endif
#else
#ifdef TINTR
#ifdef VAES
#include "groestl256-intr-aes.h"
#else
#ifdef VAVX
#include "groestl256-intr-avx.h"
#else
#ifdef VVPERM
#include "groestl256-intr-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VPERM])
#endif
#endif
#endif
#else
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
#endif
#endif
/* initialise context */
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
@@ -21,6 +55,7 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;

View File

@@ -15,7 +15,7 @@
#include "miner.h"
#include "simd-utils.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__)
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )

View File

@@ -18,8 +18,6 @@
#endif
#include <stdlib.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LENGTH (256)
//#include "brg_endian.h"
@@ -71,5 +69,4 @@ int groestl256_4way_init( groestl256_4way_context*, uint64_t );
int groestl256_4way_update_close( groestl256_4way_context*, void*,
const void*, uint64_t );
#endif
#endif

View File

@@ -15,22 +15,29 @@
#include "miner.h"
#include "simd-utils.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__VAES__)
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
memset_zero_512( ctx->chaining, SIZE512 );
memset_zero_512( ctx->buffer, SIZE512 );
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = m512_zero;
ctx->buffer[i] = m512_zero;
}
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
// uint64_t len = U64BIG((uint64_t)LENGTH);
// ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
// INIT_4way(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -42,7 +49,7 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
const void* input, uint64_t databitlen )
{
const int len = (int)databitlen / 128;
const int hashlen_m128i = 64 / 16; // bytes to __m128i
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE512;
@@ -51,13 +58,16 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
// --- update ---
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
// copy any remaining data to buffer; it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem;
i += rem; // use i as rem_ptr in final
//--- final ---
@@ -71,71 +81,23 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
}
else
{
// add first padding
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = m512_zero;
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
}
// digest final padding block and do output transform
TF1024_4way( ctx->chaining, ctx->buffer );
OF1024_4way( ctx->chaining );
for ( i = 0; i < hashlen_m128i; i++ )
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
const void* input, uint64_t datalen )
{
const int len = (int)datalen >> 4;
const int hashlen_m128i = 64 >> 4; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
uint64_t blocks = len / SIZE512;
__m512i* in = (__m512i*)input;
int i;
// --- init ---
SET_CONSTANTS();
memset_zero_512( ctx->chaining, SIZE512 );
memset_zero_512( ctx->buffer, SIZE512 );
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
for ( i = 0; i < blocks; i++ )
TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
// --- close ---
blocks++;
if ( i == SIZE512 - 1 )
{
// only 1 vector left in buffer, all padding at once
ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
}
else
{
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = m512_zero;
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
}
TF1024_4way( ctx->chaining, ctx->buffer );
OF1024_4way( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];

View File

@@ -1,3 +1,11 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#if !defined(GROESTL512_HASH_4WAY_H__)
#define GROESTL512_HASH_4WAY_H__ 1
@@ -10,10 +18,12 @@
#endif
#include <stdlib.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LENGTH (512)
//#include "brg_endian.h"
//#define NEED_UINT_64T
//#include "algo/sha/brg_types.h"
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
@@ -34,11 +44,34 @@
#define ROUNDS (ROUNDS1024)
//#endif
/*
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif // IS_BIG_ENDIAN
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
#define U64BIG(a) \
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
(ROTL64(a,56) & li_64(FF000000FF000000)))
#endif // IS_LITTLE_ENDIAN
typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
*/
#define SIZE512 (SIZE_1024/16)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];
__attribute__ ((aligned (64))) __m512i buffer[SIZE512];
int hashlen; // byte
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
int rem_ptr;
@@ -52,11 +85,10 @@ int groestl512_4way_init( groestl512_4way_context*, uint64_t );
int groestl512_4way_update( groestl512_4way_context*, const void*,
uint64_t );
int groestl512_4way_close( groestl512_4way_context*, void* );
int groestl512_4way_update_close( groestl512_4way_context*, void*,
const void*, uint64_t );
int groestl512_4way_full( groestl512_4way_context*, void*,
const void*, uint64_t );
#endif // VAES
#endif // GROESTL512_HASH_4WAY_H__
#endif /* __hash_h */

View File

@@ -161,7 +161,7 @@ bool register_hodl_algo( algo_gate_t* gate )
// return false;
// }
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = SSE42_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;

View File

@@ -41,10 +41,57 @@
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
#define SPH_SMALL_FOOTPRINT_JH 1
#endif
#if !defined SPH_JH_64 && SPH_64_TRUE
#define SPH_JH_64 1
#endif
#if !SPH_64
#undef SPH_JH_64
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/*
* The internal bitslice representation may use either big-endian or
* little-endian (true bitslice operations do not care about the bit
* ordering, and the bit-swapping linear operations in JH happen to
* be invariant through endianness-swapping). The constants must be
* defined according to the chosen endianness; we use some
* byte-swapping macros for that.
*/
#if SPH_LITTLE_ENDIAN
#if SPH_64
#define C64e(x) ((SPH_C64(x) >> 56) \
| ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
| ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
| ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \
| ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \
| ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
| ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
| ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
#define dec64e_aligned sph_dec64le_aligned
#define enc64e sph_enc64le
#endif
#else
#if SPH_64
#define C64e(x) SPH_C64(x)
#define dec64e_aligned sph_dec64be_aligned
#define enc64e sph_enc64be
#endif
#endif
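/* Worked example of C64e on a little-endian build: the macro byte-swaps
 * its argument, so C64e(0x72d5dea2df15f867) evaluates to
 * 0x67f815dfa2ded572 -- exactly the first entry of the pre-swapped C[]
 * table below. On big-endian builds C64e is the identity. */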
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define Sb_8W(x0, x1, x2, x3, c) \
@@ -105,97 +152,8 @@ do { \
x3 = _mm256_xor_si256( x3, x4 ); \
} while (0)
static const uint64_t C[] =
{
0x67f815dfa2ded572, 0x571523b70a15847b,
0xf6875a4d90d6ab81, 0x402bd1c3c54f9f4e,
0x9cfa455ce03a98ea, 0x9a99b26699d2c503,
0x8a53bbf2b4960266, 0x31a2db881a1456b5,
0xdb0e199a5c5aa303, 0x1044c1870ab23f40,
0x1d959e848019051c, 0xdccde75eadeb336f,
0x416bbf029213ba10, 0xd027bbf7156578dc,
0x5078aa3739812c0a, 0xd3910041d2bf1a3f,
0x907eccf60d5a2d42, 0xce97c0929c9f62dd,
0xac442bc70ba75c18, 0x23fcc663d665dfd1,
0x1ab8e09e036c6e97, 0xa8ec6c447e450521,
0xfa618e5dbb03f1ee, 0x97818394b29796fd,
0x2f3003db37858e4a, 0x956a9ffb2d8d672a,
0x6c69b8f88173fe8a, 0x14427fc04672c78a,
0xc45ec7bd8f15f4c5, 0x80bb118fa76f4475,
0xbc88e4aeb775de52, 0xf4a3a6981e00b882,
0x1563a3a9338ff48e, 0x89f9b7d524565faa,
0xfde05a7c20edf1b6, 0x362c42065ae9ca36,
0x3d98fe4e433529ce, 0xa74b9a7374f93a53,
0x86814e6f591ff5d0, 0x9f5ad8af81ad9d0e,
0x6a6234ee670605a7, 0x2717b96ebe280b8b,
0x3f1080c626077447, 0x7b487ec66f7ea0e0,
0xc0a4f84aa50a550d, 0x9ef18e979fe7e391,
0xd48d605081727686, 0x62b0e5f3415a9e7e,
0x7a205440ec1f9ffc, 0x84c9f4ce001ae4e3,
0xd895fa9df594d74f, 0xa554c324117e2e55,
0x286efebd2872df5b, 0xb2c4a50fe27ff578,
0x2ed349eeef7c8905, 0x7f5928eb85937e44,
0x4a3124b337695f70, 0x65e4d61df128865e,
0xe720b95104771bc7, 0x8a87d423e843fe74,
0xf2947692a3e8297d, 0xc1d9309b097acbdd,
0xe01bdc5bfb301b1d, 0xbf829cf24f4924da,
0xffbf70b431bae7a4, 0x48bcf8de0544320d,
0x39d3bb5332fcae3b, 0xa08b29e0c1c39f45,
0x0f09aef7fd05c9e5, 0x34f1904212347094,
0x95ed44e301b771a2, 0x4a982f4f368e3be9,
0x15f66ca0631d4088, 0xffaf52874b44c147,
0x30c60ae2f14abb7e, 0xe68c6eccc5b67046,
0x00ca4fbd56a4d5a4, 0xae183ec84b849dda,
0xadd1643045ce5773, 0x67255c1468cea6e8,
0x16e10ecbf28cdaa3, 0x9a99949a5806e933,
0x7b846fc220b2601f, 0x1885d1a07facced1,
0xd319dd8da15b5932, 0x46b4a5aac01c9a50,
0xba6b04e467633d9f, 0x7eee560bab19caf6,
0x742128a9ea79b11f, 0xee51363b35f7bde9,
0x76d350755aac571d, 0x01707da3fec2463a,
0x42d8a498afc135f7, 0x79676b9e20eced78,
0xa8db3aea15638341, 0x832c83324d3bc3fa,
0xf347271c1f3b40a7, 0x9a762db734f04059,
0xfd4f21d26c4e3ee7, 0xef5957dc398dfdb8,
0xdaeb492b490c9b8d, 0x0d70f36849d7a25b,
0x84558d7ad0ae3b7d, 0x658ef8e4f0e9a5f5,
0x533b1036f4a2b8a0, 0x5aec3e759e07a80c,
0x4f88e85692946891, 0x4cbcbaf8555cb05b,
0x7b9487f3993bbbe3, 0x5d1c6b72d6f4da75,
0x6db334dc28acae64, 0x71db28b850a5346c,
0x2a518d10f2e261f8, 0xfc75dd593364dbe3,
0xa23fce43f1bcac1c, 0xb043e8023cd1bb67,
0x75a12988ca5b0a33, 0x5c5316b44d19347f,
0x1e4d790ec3943b92, 0x3fafeeb6d7757479,
0x21391abef7d4a8ea, 0x5127234c097ef45c,
0xd23c32ba5324a326, 0xadd5a66d4a17a344,
0x08c9f2afa63e1db5, 0x563c6b91983d5983,
0x4d608672a17cf84c, 0xf6c76e08cc3ee246,
0x5e76bcb1b333982f, 0x2ae6c4efa566d62b,
0x36d4c1bee8b6f406, 0x6321efbc1582ee74,
0x69c953f40d4ec1fd, 0x26585806c45a7da7,
0x16fae0061614c17e, 0x3f9d63283daf907e,
0x0cd29b00e3f2c9d2, 0x300cd4b730ceaa5f,
0x9832e0f216512a74, 0x9af8cee3d830eb0d,
0x9279f1b57b9ec54b, 0xd36886046ee651ff,
0x316796e6574d239b, 0x05750a17f3a6e6cc,
0xce6c3213d98176b1, 0x62a205f88452173c,
0x47154778b3cb2bf4, 0x486a9323825446ff,
0x65655e4e0758df38, 0x8e5086fc897cfcf2,
0x86ca0bd0442e7031, 0x4e477830a20940f0,
0x8338f7d139eea065, 0xbd3a2ce437e95ef7,
0x6ff8130126b29721, 0xe7de9fefd1ed44a3,
0xd992257615dfa08b, 0xbe42dc12f6f7853c,
0x7eb027ab7ceca7d8, 0xdea83eaada7d8d53,
0xd86902bd93ce25aa, 0xf908731afd43f65a,
0xa5194a17daef5fc0, 0x6a21fd4c33664d97,
0x701541db3198b435, 0x9b54cdedbb0f1eea,
0x72409751a163d09a, 0xe26f4791bf9d75f6
};
#if SPH_JH_64
// Big endian version
/*
static const sph_u64 C[] = {
C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
@@ -282,7 +240,6 @@ static const sph_u64 C[] = {
C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
};
*/
#define Ceven_hi(r) (C[((r) << 2) + 0])
#define Ceven_lo(r) (C[((r) << 2) + 1])
@@ -470,7 +427,7 @@ do { \
h7h = _mm256_xor_si256( h7h, m3h ); \
h7l = _mm256_xor_si256( h7l, m3l ); \
/*
static const sph_u64 IV256[] = {
C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
@@ -493,8 +450,11 @@ static const sph_u64 IV512[] = {
C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
};
*/
#else
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -524,6 +484,57 @@ static const sph_u64 IV512[] = {
W ## ro(h7); \
} while (0)
#if SPH_SMALL_FOOTPRINT_JH
#if SPH_JH_64
/*
* The "small footprint" 64-bit version just uses a partially unrolled
* loop.
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define E8_8W do { \
unsigned r; \
for (r = 0; r < 42; r += 7) { \
SL_8W(0); \
SL_8W(1); \
SL_8W(2); \
SL_8W(3); \
SL_8W(4); \
SL_8W(5); \
SL_8W(6); \
} \
} while (0)
#endif
#define E8 do { \
unsigned r; \
for (r = 0; r < 42; r += 7) { \
SL(0); \
SL(1); \
SL(2); \
SL(3); \
SL(4); \
SL(5); \
SL(6); \
} \
} while (0)
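/* E8 runs JH's 42 rounds as 6 iterations of 7 unrolled steps: the round
 * wiring cycles with period 7, so SL(0)..SL(6) cover one full cycle and
 * the loop counter r only advances the round-constant index. */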
#else
#endif
#else
#if SPH_JH_64
/*
* On a "true 64-bit" architecture, we can unroll at will.
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -574,7 +585,6 @@ static const sph_u64 IV512[] = {
#endif // AVX512
#define E8 do { \
SLu( 0, 0); \
SLu( 1, 1); \
@@ -620,6 +630,13 @@ static const sph_u64 IV512[] = {
SLu(41, 6); \
} while (0)
#else
#endif
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
void jh256_8way_init( jh_8way_context *sc )
@@ -715,12 +732,12 @@ jh_8way_core( jh_8way_context *sc, const void *data, size_t len )
static void
jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
size_t out_size_w32 )
size_t out_size_w32, const void *iv )
{
__m512i buf[16*4];
__m512i *dst512 = (__m512i*)dst;
size_t numz, u;
uint64_t l0, l1;
sph_u64 l0, l1, l0e, l1e;
buf[0] = m512_const1_64( 0x80ULL );
@@ -731,10 +748,12 @@ jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
memset_zero_512( buf+1, (numz>>3) - 1 );
l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 );
l1 = ( sc->block_count >> 55 );
*(buf + (numz>>3) ) = _mm512_set1_epi64( bswap_64( l1 ) );
*(buf + (numz>>3) + 1) = _mm512_set1_epi64( bswap_64( l0 ) );
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
l1 = SPH_T64(sc->block_count >> 55);
sph_enc64be( &l0e, l0 );
sph_enc64be( &l1e, l1 );
*(buf + (numz>>3) ) = _mm512_set1_epi64( l1e );
*(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e );
jh_8way_core( sc, buf, numz + 16 );
@@ -753,7 +772,7 @@ jh256_8way_update(void *cc, const void *data, size_t len)
void
jh256_8way_close(void *cc, void *dst)
{
jh_8way_close(cc, 0, 0, dst, 8);
jh_8way_close(cc, 0, 0, dst, 8, IV256);
}
void
@@ -765,7 +784,7 @@ jh512_8way_update(void *cc, const void *data, size_t len)
void
jh512_8way_close(void *cc, void *dst)
{
jh_8way_close(cc, 0, 0, dst, 16);
jh_8way_close(cc, 0, 0, dst, 16, IV512);
}
#endif
@@ -863,12 +882,12 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
static void
jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
size_t out_size_w32 )
size_t out_size_w32, const void *iv )
{
__m256i buf[16*4];
__m256i *dst256 = (__m256i*)dst;
size_t numz, u;
uint64_t l0, l1;
sph_u64 l0, l1, l0e, l1e;
buf[0] = m256_const1_64( 0x80ULL );
@@ -879,10 +898,12 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
memset_zero_256( buf+1, (numz>>3) - 1 );
l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 );
l1 = ( sc->block_count >> 55 );
*(buf + (numz>>3) ) = _mm256_set1_epi64x( bswap_64( l1 ) );
*(buf + (numz>>3) + 1) = _mm256_set1_epi64x( bswap_64( l0 ) );
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
l1 = SPH_T64(sc->block_count >> 55);
sph_enc64be( &l0e, l0 );
sph_enc64be( &l1e, l1 );
*(buf + (numz>>3) ) = _mm256_set1_epi64x( l1e );
*(buf + (numz>>3) + 1) = _mm256_set1_epi64x( l0e );
jh_4way_core( sc, buf, numz + 16 );
@@ -901,7 +922,7 @@ jh256_4way_update(void *cc, const void *data, size_t len)
void
jh256_4way_close(void *cc, void *dst)
{
jh_4way_close(cc, 0, 0, dst, 8 );
jh_4way_close(cc, 0, 0, dst, 8, IV256);
}
void
@@ -913,7 +934,7 @@ jh512_4way_update(void *cc, const void *data, size_t len)
void
jh512_4way_close(void *cc, void *dst)
{
jh_4way_close(cc, 0, 0, dst, 16 );
jh_4way_close(cc, 0, 0, dst, 16, IV512);
}

View File

@@ -43,6 +43,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_jh256 256

View File

@@ -65,7 +65,7 @@ void jha_hash_4way( void *out, const void *input )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
blake512_4way_init( &ctx_blake );
blake512_4way_update( &ctx_blake, vhash, 64 );
blake512_4way( &ctx_blake, vhash, 64 );
blake512_4way_close( &ctx_blake, vhashA );
jh512_4way_init( &ctx_jh );

View File

@@ -28,32 +28,30 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
keccakhash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}
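/* Pattern used above (and in the 4-way variant below): interleave one
 * 80-byte header into the vector lanes, patch consecutive nonces into
 * the lane words, hash all lanes at once, then de-interleave only the
 * lanes whose high hash word passes the cheap Htarg pre-check before
 * running the full target test. */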
@@ -81,30 +79,29 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
keccakhash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
if ( hash7[ lane<<1 ] <= Htarg )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ))
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -1,9 +1,5 @@
#include "keccak-gate.h"
#include "sph_keccak.h"
int hard_coded_eb = 1;
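// eb is the first padding byte (Keccak pad10*1 plus domain bits):
// legacy Keccak uses 0x01, while sha3d switches to the SHA-3 style 0x06
// via register_sha3d_algo below.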
// KECCAK
bool register_keccak_algo( algo_gate_t* gate )
{
@@ -23,8 +19,6 @@ bool register_keccak_algo( algo_gate_t* gate )
return true;
};
// KECCAKC
bool register_keccakc_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
@@ -43,50 +37,3 @@ bool register_keccakc_algo( algo_gate_t* gate )
return true;
};
// SHA3D
void sha3d( void *state, const void *input, int len )
{
uint32_t _ALIGN(64) buffer[16], hash[16];
sph_keccak_context ctx_keccak;
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, input, len );
sph_keccak256_close( &ctx_keccak, (void*) buffer );
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, buffer, 32 );
sph_keccak256_close( &ctx_keccak, (void*) hash );
memcpy(state, hash, 32);
}
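// In other words sha3d(x) = keccak256( keccak256( x ) ): a double
// Keccak-256 analogous to Bitcoin's sha256d, with SHA-3 style padding
// since register_sha3d_algo sets hard_coded_eb = 6.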
void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
{
sha3d( merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size );
for ( int i = 0; i < sctx->job.merkle_count; i++ )
{
memcpy( merkle_root + 32, sctx->job.merkle[i], 32 );
sha256d( merkle_root, merkle_root, 64 );
}
}
bool register_sha3d_algo( algo_gate_t* gate )
{
hard_coded_eb = 6;
// opt_extranonce = false;
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
#if defined (KECCAK_8WAY)
gate->scanhash = (void*)&scanhash_sha3d_8way;
gate->hash = (void*)&sha3d_hash_8way;
#elif defined (KECCAK_4WAY)
gate->scanhash = (void*)&scanhash_sha3d_4way;
gate->hash = (void*)&sha3d_hash_4way;
#else
gate->scanhash = (void*)&scanhash_sha3d;
gate->hash = (void*)&sha3d_hash;
#endif
return true;
};

View File

@@ -10,37 +10,24 @@
#define KECCAK_4WAY 1
#endif
extern int hard_coded_eb;
#if defined(KECCAK_8WAY)
void keccakhash_8way( void *state, const void *input );
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_8way( void *state, const void *input );
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash_4way( void *state, const void *input );
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void keccakhash( void *state, const void *input );
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha3d_hash( void *state, const void *input );
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif
#endif

View File

@@ -1,7 +1,6 @@
#include <stddef.h>
#include <stdint.h>
#include "keccak-hash-4way.h"
#include "keccak-gate.h"
static const uint64_t RC[] = {
0x0000000000000001, 0x0000000000008082,
@@ -164,12 +163,12 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
unsigned eb;
union {
__m512i tmp[lim + 1];
uint64_t dummy; /* for alignment */
sph_u64 dummy; /* for alignment */
} u;
size_t j;
size_t m512_len = byte_len >> 3;
eb = hard_coded_eb;
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
@@ -345,12 +344,12 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
unsigned eb;
union {
__m256i tmp[lim + 1];
uint64_t dummy; /* for alignment */
sph_u64 dummy; /* for alignment */
} u;
size_t j;
size_t m256_len = byte_len >> 3;
eb = hard_coded_eb;
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;

View File

@@ -43,8 +43,16 @@ extern "C"{
#ifdef __AVX2__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_keccak256 256
/**
* Output size (in bits) for Keccak-512.
*/
#define SPH_SIZE_keccak512 512
/**
* This structure is a context for Keccak computations: it contains the
* intermediate values and some data from the last entered block. Once a

View File

@@ -18,34 +18,36 @@ void keccakhash(void *state, const void *input)
memcpy(state, hash, 32);
}
int scanhash_keccak( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
int scanhash_keccak( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[32];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
const int thr_id = mythr->id;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
//const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint32_t _ALIGN(32) hash64[8];
uint32_t endiandata[32];
do {
be32enc( &endiandata[19], n );
keccakhash( hash64, endiandata );
if ( valid_hash( hash64, ptarget ) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while ( n < last_nonce && !work_restart[thr_id].restart );
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
keccakhash(hash64, endiandata);
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -1,126 +0,0 @@
#include "keccak-gate.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)
void sha3d_hash_8way(void *state, const void *input)
{
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
keccak256_8way_context ctx;
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, input, 80 );
keccak256_8way_close( &ctx, buffer );
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, buffer, 32 );
keccak256_8way_close( &ctx, state );
}
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*8] __attribute__ ((aligned (128)));
uint32_t hash[16*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]); // 3*16+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do {
sha3d_hash_8way( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(KECCAK_4WAY)
void sha3d_hash_4way(void *state, const void *input)
{
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, input, 80 );
keccak256_4way_close( &ctx, buffer );
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, buffer, 32 );
keccak256_4way_close( &ctx, state );
}
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t hash[16*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[25]); // 3*8+1
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do {
sha3d_hash_4way( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -1,50 +0,0 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
void sha3d_hash(void *state, const void *input)
{
uint32_t buffer[16];
sph_keccak256_context ctx_keccak;
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, input, 80 );
sph_keccak256_close( &ctx_keccak, buffer );
sph_keccak256_init( &ctx_keccak );
sph_keccak256 ( &ctx_keccak, buffer, 32 );
sph_keccak256_close( &ctx_keccak, state );
}
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash64[8];
uint32_t _ALIGN(64) endiandata[32];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
const int thr_id = mythr->id;
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
do {
be32enc( &endiandata[19], n );
sha3d_hash( hash64, endiandata );
if ( valid_hash( hash64, ptarget ) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}

View File

@@ -32,8 +32,8 @@
#include <stddef.h>
#include <string.h>
#include "sph_keccak.h"
#include "keccak-gate.h"
#ifdef __cplusplus
extern "C"{
@@ -1616,7 +1616,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
} u; \
size_t j; \
\
eb = hard_coded_eb; \
eb = (0x100 | (ub & 0xFF)) >> (8 - n); \
if (kc->ptr == (lim - 1)) { \
if (n == 7) { \
u.tmp[0] = eb; \

View File

@@ -459,11 +459,6 @@ int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
return 0;
}
int luffa512_4way_init( luffa_4way_context *state )
{
return luffa_4way_init( state, 512 );
}
// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called only call luffa_update or luffa_close.
int luffa_4way_update( luffa_4way_context *state, const void *data,
@@ -501,14 +496,6 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
return 0;
}
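/* Usage sketch for the constraint noted above: either stream
 *   luffa_4way_init(); luffa_4way_update(); ...; luffa_4way_close();
 * or do it in one shot with
 *   luffa_4way_init(); luffa_4way_update_close();
 * but never follow luffa_4way_update() with luffa_4way_update_close(). */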
/*
int luffa512_4way_update( luffa_4way_context *state, const void *data,
size_t len )
{
return luffa_4way_update( state, data, len );
}
*/
int luffa_4way_close( luffa_4way_context *state, void *hashval )
{
__m512i *buffer = (__m512i*)state->buffer;
@@ -531,77 +518,6 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval )
return 0;
}
/*
int luffa512_4way_close( luffa_4way_context *state, void *hashval )
{
return luffa_4way_close( state, hashval );
}
*/
int luffa512_4way_full( luffa_4way_context *state, void *output,
const void *data, size_t inlen )
{
state->hashbitlen = 512;
__m128i *iv = (__m128i*)IV;
state->chainv[0] = m512_const1_128( iv[0] );
state->chainv[1] = m512_const1_128( iv[1] );
state->chainv[2] = m512_const1_128( iv[2] );
state->chainv[3] = m512_const1_128( iv[3] );
state->chainv[4] = m512_const1_128( iv[4] );
state->chainv[5] = m512_const1_128( iv[5] );
state->chainv[6] = m512_const1_128( iv[6] );
state->chainv[7] = m512_const1_128( iv[7] );
state->chainv[8] = m512_const1_128( iv[8] );
state->chainv[9] = m512_const1_128( iv[9] );
((__m512i*)state->buffer)[0] = m512_zero;
((__m512i*)state->buffer)[1] = m512_zero;
const __m512i *vdata = (__m512i*)data;
__m512i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m512i shuff_bswap32 = m512_const_64(
0x3c3d3e3f38393a3b, 0x3435363730313233,
0x2c2d2e2f28292a2b, 0x2425262720212223,
0x1c1d1e1f18191a1b, 0x1415161710111213,
0x0c0d0e0f08090a0b, 0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_4way( state, msg );
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = m512_const2_64( 0, 0x0000000080000000 );
rnd512_4way( state, msg );
}
else
{
// empty pad block
msg[0] = m512_const2_64( 0, 0x0000000080000000 );
msg[1] = m512_zero;
rnd512_4way( state, msg );
}
finalization512_4way( state, (uint32*)output );
if ( state->hashbitlen > 512 )
finalization512_4way( state, (uint32*)( output+64 ) );
return 0;
}
int luffa_4way_update_close( luffa_4way_context *state,
void *output, const void *data, size_t inlen )
{
@@ -1115,69 +1031,6 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
return 0;
}
int luffa512_2way_full( luffa_2way_context *state, void *output,
const void *data, size_t inlen )
{
state->hashbitlen = 512;
__m128i *iv = (__m128i*)IV;
state->chainv[0] = m256_const1_128( iv[0] );
state->chainv[1] = m256_const1_128( iv[1] );
state->chainv[2] = m256_const1_128( iv[2] );
state->chainv[3] = m256_const1_128( iv[3] );
state->chainv[4] = m256_const1_128( iv[4] );
state->chainv[5] = m256_const1_128( iv[5] );
state->chainv[6] = m256_const1_128( iv[6] );
state->chainv[7] = m256_const1_128( iv[7] );
state->chainv[8] = m256_const1_128( iv[8] );
state->chainv[9] = m256_const1_128( iv[9] );
((__m256i*)state->buffer)[0] = m256_zero;
((__m256i*)state->buffer)[1] = m256_zero;
const __m256i *vdata = (__m256i*)data;
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_2way( state, msg );
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
{
// padding of partial block
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = m256_const2_64( 0, 0x0000000080000000 );
rnd512_2way( state, msg );
}
else
{
// empty pad block
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}
finalization512_2way( state, (uint32*)output );
if ( state->hashbitlen > 512 )
finalization512_2way( state, (uint32*)( output+32 ) );
return 0;
}
int luffa_2way_update_close( luffa_2way_context *state,
void *output, const void *data, size_t inlen )
{

View File

@@ -61,23 +61,11 @@ typedef struct {
} luffa_4way_context __attribute((aligned(128)));
int luffa_4way_init( luffa_4way_context *state, int hashbitlen );
//int luffa_4way_update( luffa_4way_context *state, const void *data,
// size_t len );
//int luffa_4way_close( luffa_4way_context *state, void *hashval );
int luffa_4way_update( luffa_4way_context *state, const void *data,
size_t len );
int luffa_4way_close( luffa_4way_context *state, void *hashval );
int luffa_4way_update_close( luffa_4way_context *state, void *output,
const void *data, size_t inlen );
int luffa512_4way_full( luffa_4way_context *state, void *output,
const void *data, size_t inlen );
int luffa512_4way_init( luffa_4way_context *state );
int luffa512_4way_update( luffa_4way_context *state, const void *data,
size_t len );
int luffa512_4way_close( luffa_4way_context *state, void *hashval );
int luffa512_4way_update_close( luffa_4way_context *state, void *output,
const void *data, size_t inlen );
#define luffa_4way_update luffa512_4way_update
#define luffa_4way_close luffa512_4way_close
#define luffa_4way_update_close luffa512_4way_update_close
#endif
@@ -94,8 +82,6 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
int luffa_2way_close( luffa_2way_context *state, void *hashval );
int luffa_2way_update_close( luffa_2way_context *state, void *output,
const void *data, size_t inlen );
int luffa512_2way_full( luffa_2way_context *state, void *output,
const void *data, size_t inlen );
#endif
#endif

View File

@@ -344,62 +344,18 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512( state, m128_const_64( 0, 0x80000000 ),
mm128_bswap_32( cast_m128i( data ) ) );
else
// empty pad block
rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
finalization512( state, (uint32*) output );
if ( state->hashbitlen > 512 )
finalization512( state, (uint32*)( output+128 ) );
return SUCCESS;
}
int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
const BitSequence* data, size_t inlen )
{
// Optimized for integral multiples of 16 bytes, good for 64 and 80 byte len
int i;
state->hashbitlen = hashbitlen;
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
/* set all bits to '1' */
ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
for ( i=0; i<10; i++ )
state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] );
memset(state->buffer, 0, sizeof state->buffer );
// update
int blocks = (int)( inlen / 32 );
state->rembytes = inlen % 32;
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
mm128_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
// padding of partial block
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm128_bswap_32( cast_m128i( data ) ) );
}
else
{
// empty pad block
rnd512( state, _mm_setzero_si128(),
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
}
// final
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512( state, m128_const_64( 0, 0x80000000 ),
mm128_bswap_32( cast_m128i( data ) ) );
else
// empty pad block
rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
finalization512( state, (uint32*) output );
if ( state->hashbitlen > 512 )
finalization512( state, (uint32*)( output+128 ) );
@@ -407,7 +363,6 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
return SUCCESS;
}
/***************************************************/
/* Round function */
/* state: hash context */

View File

@@ -65,6 +65,5 @@ HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval );
HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
const BitSequence* data, size_t inlen );
int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
const BitSequence* data, size_t inlen );

View File

@@ -262,33 +262,38 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
const uint32_t last_nonce = max_nonce - 8;
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id; // thr_id arg is deprecated
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
blake256_16way_init( &allium_16way_ctx.blake );
blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
do {
allium_16way_hash( hash, vdata );
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n +1, n ) );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
allium_16way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 16; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
@@ -315,18 +320,18 @@ bool init_allium_8way_ctx()
return true;
}
void allium_8way_hash( void *hash, const void *input )
void allium_8way_hash( void *state, const void *input )
{
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t *hash0 = (uint64_t*)hash;
uint64_t *hash1 = (uint64_t*)hash+ 4;
uint64_t *hash2 = (uint64_t*)hash+ 8;
uint64_t *hash3 = (uint64_t*)hash+12;
uint64_t *hash4 = (uint64_t*)hash+16;
uint64_t *hash5 = (uint64_t*)hash+20;
uint64_t *hash6 = (uint64_t*)hash+24;
uint64_t *hash7 = (uint64_t*)hash+28;
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (32)));
uint32_t hash6[8] __attribute__ ((aligned (32)));
uint32_t hash7[8] __attribute__ ((aligned (32)));
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
@@ -393,66 +398,69 @@ void allium_8way_hash( void *hash, const void *input )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash4, hash4, 256 );
update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash5, hash5, 256 );
update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash6, hash6, 256 );
update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash7, hash7, 256 );
update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
}
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint64_t *ptarget = (uint64_t*)work->target;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
int thr_id = mythr->id;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
do {
allium_8way_hash( hash, vdata );
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
for ( int lane = 0; lane < 8; lane++ )
allium_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( n + lane );
submit_lane_solution( work, lane_hash, mythr, lane );
}
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
}
n += 8;
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}


@@ -194,7 +194,7 @@ bool register_allium_algo( algo_gate_t* gate )
/////////////////////////////////////////
bool phi2_has_roots = false;
bool phi2_has_roots;
bool phi2_use_roots = false;
int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; }
@@ -220,7 +220,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
// Assemble block header
algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
(uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits), NULL );
le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) );
for ( t = 0; t < 16; t++ )
g_work->data[ 20+t ] = ((uint32_t*)sctx->job.extra)[t];
}


@@ -189,7 +189,7 @@ bool init_allium_ctx();
// #define PHI2_4WAY
#endif
extern bool phi2_has_roots;
bool phi2_has_roots;
bool register_phi2_algo( algo_gate_t* gate );
#if defined(PHI2_4WAY)
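A hedged illustration of why the `extern` matters in the pair of lines above: with GCC 10's default -fno-common, a tentative definition in a header becomes a duplicate symbol at link time ("multiple definition of `phi2_has_roots'"). File names here are hypothetical:

/* phi2_gate.h -- declaration only (needs <stdbool.h>) */
extern bool phi2_has_roots;

/* phi2_gate.c -- exactly one definition */
#include <stdbool.h>
bool phi2_has_roots = false;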


@@ -33,7 +33,7 @@ void lyra2h_4way_hash( void *state, const void *input )
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );


@@ -260,8 +260,8 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
// Overlap has a 2 in Nrows chance, reduced to 1 in Nrows because if both
// rows overlap the case is unified.
// As a result the normal case occurs with probability (Nrows-2) / Nrows.
// for 4 rows: 1 unified, 2 overlap, 1 normal.
// for 8 rows: 1 unified, 2 overlap, 56 normal.
// for 4 rows: 1 unified, 1 overlap, 2 normal.
// for 8 rows: 1 unified, 1 overlap, 6 normal.
static inline void reducedDuplexRow_2way_normal( uint64_t *State,
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
@@ -338,18 +338,21 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
// rowInOut0 ! = rowInOut1 != rowOut
static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols)
{
int i;
register __m512i state0, state1, state2, state3;
__m512i *in = (__m512i*)rowIn;
__m512i *inout0 = (__m512i*)rowInOut0;
__m512i *inout1 = (__m512i*)rowInOut1;
__m512i *out = (__m512i*)rowOut;
// inout_ovly io;
ovly_512 io0, io1, io2;
inout_ovly io;
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -359,21 +362,6 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
io0.v512 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
io1.v512 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +1 ),
_mm512_load_si512( (__m512i*)inout1 +1 ) );
io2.v512 = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 +2 ),
_mm512_load_si512( (__m512i*)inout1 +2 ) );
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
/*
io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
_mm512_load_si512( (__m512i*)inout0 ),
_mm512_load_si512( (__m512i*)inout1 ) );
@@ -387,7 +375,6 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
*/
//Applies the reduced-round transformation f to the sponge's state
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
@@ -401,21 +388,6 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
out[2] = _mm512_xor_si512( out[2], state2 );
// if out is the same row as inout, update with new data.
if ( rowOut == rowInOut0 )
{
io0.v512 = _mm512_mask_blend_epi64( 0x0f, io0.v512, out[0] );
io1.v512 = _mm512_mask_blend_epi64( 0x0f, io1.v512, out[1] );
io2.v512 = _mm512_mask_blend_epi64( 0x0f, io2.v512, out[2] );
}
if ( rowOut == rowInOut1 )
{
io0.v512 = _mm512_mask_blend_epi64( 0xf0, io0.v512, out[0] );
io1.v512 = _mm512_mask_blend_epi64( 0xf0, io1.v512, out[1] );
io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
}
/*
if ( rowOut == rowInOut0 )
{
io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
@@ -429,35 +401,27 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
}
*/
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
t2 = _mm512_permutex_epi64( state2, 0x93 );
io0.v512 = _mm512_xor_si512( io0.v512,
io.v512[0] = _mm512_xor_si512( io.v512[0],
_mm512_mask_blend_epi64( 0x11, t0, t2 ) );
io1.v512 = _mm512_xor_si512( io1.v512,
io.v512[1] = _mm512_xor_si512( io.v512[1],
_mm512_mask_blend_epi64( 0x11, t1, t0 ) );
io2.v512 = _mm512_xor_si512( io2.v512,
io.v512[2] = _mm512_xor_si512( io.v512[2],
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
}
casti_m256i( inout0, 0 ) = io0.v256lo;
casti_m256i( inout1, 1 ) = io0.v256hi;
casti_m256i( inout0, 2 ) = io1.v256lo;
casti_m256i( inout1, 3 ) = io1.v256hi;
casti_m256i( inout0, 4 ) = io2.v256lo;
casti_m256i( inout1, 5 ) = io2.v256hi;
/*
_mm512_mask_store_epi64( inout0, 0x0f, io.v512[0] );
_mm512_mask_store_epi64( inout1, 0xf0, io.v512[0] );
_mm512_mask_store_epi64( inout0 +1, 0x0f, io.v512[1] );
_mm512_mask_store_epi64( inout1 +1, 0xf0, io.v512[1] );
_mm512_mask_store_epi64( inout0 +2, 0x0f, io.v512[2] );
_mm512_mask_store_epi64( inout1 +2, 0xf0, io.v512[2] );
*/
//Goes to next block
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I;
@@ -602,7 +566,7 @@ static inline void reducedDuplexRow_2way_unified( uint64_t *State,
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi64( 0x11, t1, t0 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
out[0] = _mm512_xor_si512( out[0], state0 );
out[1] = _mm512_xor_si512( out[1], state1 );
@@ -611,9 +575,9 @@ static inline void reducedDuplexRow_2way_unified( uint64_t *State,
}
//Goes to next block
in += BLOCK_LEN_M256I;
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );
@@ -636,8 +600,8 @@ static inline void reducedDuplexRow_2way_unified( uint64_t *State,
// Wrapper
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols )
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols )
{
if ( rowInOut0 == rowInOut1 )
reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
@@ -650,18 +614,18 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
}
inline void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols )
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols )
{
if ( rowInOut0 == rowInOut1 )
if ( rowInOut0 == rowInOut1 )
reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) )
{
asm volatile ( "nop" ); // Prevent GCC from optimizing
reducedDuplexRow_2way_overlap_X( State, rowIn, rowInOut0, rowInOut1,
rowOut, nCols );
}
else
else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) )
{
asm ( "nop" ); // This prevents GCC from merging with previous function
reducedDuplexRow_2way_overlap_X( State, rowIn, rowInOut0, rowInOut1,
rowOut, nCols );
}
else
reducedDuplexRow_2way_normal( State, rowIn, rowInOut0, rowInOut1,
rowOut, nCols );
}


@@ -203,18 +203,6 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
union _ovly_512
{
__m512i v512;
struct
{
__m256i v256lo;
__m256i v256hi;
};
};
typedef union _ovly_512 ovly_512;
union _inout_ovly
{
__m512i v512[3];


@@ -149,7 +149,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
char data_str[161], hash_str[65], target_str[65];
//uint8_t *bdata = 0;
uint8_t bdata[8192] __attribute__ ((aligned (64)));
int i, digits;
int rc = 0, i, digits;
int bytes;
size_t p = sizeof(unsigned long), a = 64/p, b = 32/p;
@@ -267,41 +267,48 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
}
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !opt_benchmark ) )
// if ( unlikely( hash[7] <= ptarget[7] ) )
// if ( likely( fulltest( hash, ptarget ) && !opt_benchmark ) )
// rewrite to use 64 bit test.
const unsigned char *hash_ = (const unsigned char *)hash;
const unsigned char *target_ = (const unsigned char *)ptarget;
for ( i = 31; i >= 0; i-- )
{
if ( opt_debug )
if ( hash_[i] != target_[i] )
{
bin2hex( hash_str, (unsigned char *)hash, 32 );
bin2hex( target_str, (unsigned char *)ptarget, 32 );
bin2hex( data_str, (unsigned char *)data, 80 );
applog( LOG_DEBUG, "DEBUG: [%d thread] Found share!\ndata %s\nhash %s\ntarget %s",
thr_id, data_str, hash_str, target_str );
rc = hash_[i] < target_[i];
break;
}
}
if ( unlikely(rc) )
{
if ( opt_debug )
{
bin2hex(hash_str, (unsigned char *)hash, 32);
bin2hex(target_str, (unsigned char *)ptarget, 32);
bin2hex(data_str, (unsigned char *)data, 80);
applog(LOG_DEBUG, "DEBUG: [%d thread] Found share!\ndata %s\nhash %s\ntarget %s", thr_id,
data_str,
hash_str,
target_str);
}
pdata[19] = data[19];
submit_solution( work, hash, mythr );
}
} while ( n < max_nonce && !work_restart[thr_id].restart );
} while (n < max_nonce && !work_restart[thr_id].restart);
pdata[19] = n;
mpf_set_prec_raw( magifpi, prec0 );
mpf_set_prec_raw( magifpi0, prec0 );
mpf_set_prec_raw( mptmp, prec0 );
mpf_set_prec_raw( mpt1, prec0 );
mpf_set_prec_raw( mpt2, prec0 );
mpf_clear( magifpi );
mpf_clear( magifpi0 );
mpf_clear( mpten );
mpf_clear( mptmp );
mpf_clear( mpt1 );
mpf_clear( mpt2 );
mpz_clears( magipi, magisw, product, bns0, bns1, NULL );
mpf_set_prec_raw(magifpi, prec0);
mpf_set_prec_raw(magifpi0, prec0);
mpf_set_prec_raw(mptmp, prec0);
mpf_set_prec_raw(mpt1, prec0);
mpf_set_prec_raw(mpt2, prec0);
mpf_clear(magifpi);
mpf_clear(magifpi0);
mpf_clear(mpten);
mpf_clear(mptmp);
mpf_clear(mpt1);
mpf_clear(mpt2);
mpz_clears(magipi, magisw, product, bns0, bns1, NULL);
*hashes_done = n - first_nonce + 1;
return 0;


@@ -154,13 +154,14 @@ int scanhash_zr5( struct work *work, uint32_t max_nonce,
}
void zr5_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t* end_nonce_ptr )
uint32_t* end_nonce_ptr, bool clean_job )
{
// ignore POK in first word
// const int nonce_i = 19;
const int wkcmp_sz = 72; // (19-1) * sizeof(uint32_t)
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
if ( memcmp( &work->data[1], &g_work->data[1], wkcmp_sz )
|| ( *nonceptr >= *end_nonce_ptr ) )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) )
{
work_free( work );
work_copy( work, g_work );


@@ -94,37 +94,6 @@ static const uint32_t K256[64] =
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
__m128i T1 = mm128_ror_32( E, 14 ); \
__m128i T2 = mm128_ror_32( A, 9 ); \
__m128i T3 = _mm_xor_si128( F, G ); \
__m128i T4 = _mm_or_si128( A, B ); \
__m128i T5 = _mm_and_si128( A, B ); \
K = _mm_add_epi32( K, W[i] ); \
T1 = _mm_xor_si128( T1, E ); \
T2 = _mm_xor_si128( T2, A ); \
T3 = _mm_and_si128( T3, E ); \
T4 = _mm_and_si128( T4, C ); \
K = _mm_add_epi32( H, K ); \
T1 = mm128_ror_32( T1, 5 ); \
T2 = mm128_ror_32( T2, 11 ); \
T3 = _mm_xor_si128( T3, G ); \
T4 = _mm_or_si128( T4, T5 ); \
T1 = _mm_xor_si128( T1, E ); \
T2 = _mm_xor_si128( T2, A ); \
T1 = mm128_ror_32( T1, 6 ); \
T2 = mm128_ror_32( T2, 2 ); \
T1 = _mm_add_epi32( T1, T3 ); \
T2 = _mm_add_epi32( T2, T4 ); \
T1 = _mm_add_epi32( T1, K ); \
H = _mm_add_epi32( T1, T2 ); \
D = _mm_add_epi32( D, T1 ); \
} while (0)
/*
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i T1, T2; \
@@ -135,8 +104,6 @@ do { \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
} while (0)
*/
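For reference, a hedged scalar version of what each step macro computes per lane (the standard SHA-256 round; the caller rotates the working variables A..H between rounds, as the macros do through their argument order):

#include <stdint.h>

static inline uint32_t ror32( uint32_t x, int c )
{ return ( x >> c ) | ( x << ( 32 - c ) ); }

/* s[0..7] = A..H; w = message word; k = round constant */
static inline void sha256_round_ref( uint32_t s[8], uint32_t w, uint32_t k )
{
    uint32_t ch   = ( s[4] & s[5] ) ^ ( ~s[4] & s[6] );
    uint32_t maj  = ( s[0] & s[1] ) ^ ( s[0] & s[2] ) ^ ( s[1] & s[2] );
    uint32_t bsg1 = ror32( s[4], 6 ) ^ ror32( s[4], 11 ) ^ ror32( s[4], 25 );
    uint32_t bsg0 = ror32( s[0], 2 ) ^ ror32( s[0], 13 ) ^ ror32( s[0], 22 );
    uint32_t t1   = s[7] + bsg1 + ch + k + w;
    uint32_t t2   = bsg0 + maj;
    s[3] += t1;         /* D += T1      */
    s[7]  = t1 + t2;    /* H  = T1 + T2 */
}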
static void
sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )


@@ -319,7 +319,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
// SHA-512 4 way 64 bit
/*
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -327,15 +327,6 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
_mm256_or_si256( _mm256_and_si256( X, Y ), \
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
#define BSG5_0(x) \
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
_mm256_xor_si256( mm256_ror_64( x, 5 ), x ), 6 ), x ), 28 )
#define BSG5_1(x) \
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
*/
/*
#define BSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) )
@@ -343,8 +334,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
#define BSG5_1(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) )
*/
/*
#define SSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 1), mm256_ror_64(x, 8) ), _mm256_srli_epi64(x, 7) )
@@ -352,7 +342,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
#define SSG5_1(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) )
*/
// Interleave SSG0 & SSG1 for better throughput.
// return ssg0(w0) + ssg1(w1)
static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
@@ -371,7 +361,7 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
return _mm256_add_epi64( w0a, w1a );
}
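A hedged scalar equivalent of ssg512_add, the two SHA-512 small sigmas summed:

#include <stdint.h>

static inline uint64_t ror64( uint64_t x, int c )
{ return ( x >> c ) | ( x << ( 64 - c ) ); }

static inline uint64_t ssg512_add_ref( uint64_t w0, uint64_t w1 )
{
    uint64_t s0 = ror64( w0,  1 ) ^ ror64( w0,  8 ) ^ ( w0 >> 7 );  /* ssg0 */
    uint64_t s1 = ror64( w1, 19 ) ^ ror64( w1, 61 ) ^ ( w1 >> 6 );  /* ssg1 */
    return s0 + s1;
}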
/*
#define SSG512x2_0( w0, w1, i ) do \
{ \
__m256i X0a, X1a, X0b, X1b; \
@@ -401,51 +391,7 @@ static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
w0 = _mm256_xor_si256( X0a, X0b ); \
w1 = _mm256_xor_si256( X1a, X1b ); \
} while(0)
*/
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i K = _mm256_set1_epi64x( K512[ i ] ); \
__m256i T1 = mm256_ror_64( E, 23 ); \
__m256i T2 = mm256_ror_64( A, 5 ); \
__m256i T3 = _mm256_xor_si256( F, G ); \
__m256i T4 = _mm256_or_si256( A, B ); \
__m256i T5 = _mm256_and_si256( A, B ); \
K = _mm256_add_epi64( K, W[i] ); \
T1 = _mm256_xor_si256( T1, E ); \
T2 = _mm256_xor_si256( T2, A ); \
T3 = _mm256_and_si256( T3, E ); \
T4 = _mm256_and_si256( T4, C ); \
K = _mm256_add_epi64( H, K ); \
T1 = mm256_ror_64( T1, 4 ); \
T2 = mm256_ror_64( T2, 6 ); \
T3 = _mm256_xor_si256( T3, G ); \
T4 = _mm256_or_si256( T4, T5 ); \
T1 = _mm256_xor_si256( T1, E ); \
T2 = _mm256_xor_si256( T2, A ); \
T1 = mm256_ror_64( T1, 14 ); \
T2 = mm256_ror_64( T2, 28 ); \
T1 = _mm256_add_epi64( T1, T3 ); \
T2 = _mm256_add_epi64( T2, T4 ); \
T1 = _mm256_add_epi64( T1, K ); \
H = _mm256_add_epi64( T1, T2 ); \
D = _mm256_add_epi64( D, T1 ); \
} while (0)
/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i K = _mm256_add_epi64( W[i], _mm256_set1_epi64x( K512[ i ] ) ); \
__m256i T1 = BSG5_1(E); \
__m256i T2 = BSG5_0(A); \
T1 = mm256_add4_64( T1, H, CH(E, F, G), K ); \
T2 = _mm256_add_epi64( T2, MAJ(A, B, C) ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
*/
/*
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i T1, T2; \
@@ -456,7 +402,7 @@ do { \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
*/
static void
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )


@@ -3,9 +3,11 @@
#include <stdio.h>
// This is a fake: it does not actually do parallel AES, which requires VAES.
// It is only intended for use when the preceding and following functions use
// the same 2x128 interleave.
// This implementation is deprecated, superseded by VAES in Icelake
// which provides HW based 4 way aes.
// It was created for AVX2 to eliminate interleaving between the
// preceding and following function.
// This code can be removed when current users have reverted to one way.
#if defined(__AVX2__)
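A hedged sketch of the "2x128" idea both comments describe (assuming AES-NI and AVX2; not the project's shavite code): each 128-bit half of a 256-bit lane pair goes through the scalar AES round, then the halves are rejoined, so no true parallel AES happens.

#include <immintrin.h>

static inline __m256i aesenc_2x128( __m256i x, __m256i k )
{
    __m128i lo = _mm256_castsi256_si128( x );
    __m128i hi = _mm256_extracti128_si256( x, 1 );
    lo = _mm_aesenc_si128( lo, _mm256_castsi256_si128( k ) );
    hi = _mm_aesenc_si128( hi, _mm256_extracti128_si256( k, 1 ) );
    return _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
}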
@@ -408,94 +410,4 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
}
void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len )
{
__m256i *h = (__m256i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
h[0] = m256_const1_128( iv[0] );
h[1] = m256_const1_128( iv[1] );
h[2] = m256_const1_128( iv[2] );
h[3] = m256_const1_128( iv[3] );
ctx->ptr =
ctx->count0 =
ctx->count1 =
ctx->count2 =
ctx->count3 = 0;
unsigned char *buf = ctx->buf;
size_t ptr = ctx->ptr;
// process full blocks and load buf with remainder.
while ( len > 0 )
{
size_t clen;
clen = (sizeof ctx->buf) - ptr;
if ( clen > len << 1 )
clen = len << 1;
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= (clen >> 1);
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
{
ctx->count1 = ctx->count1 + 1;
if ( ctx->count1 == 0 )
{
ctx->count2 = ctx->count2 + 1;
if ( ctx->count2 == 0 )
ctx->count3 = ctx->count3 + 1;
}
}
c512_2way( ctx, buf );
ptr = 0;
}
}
uint32_t vp = ptr>>5;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned by 16 bits and straddles 2 vectors.
// Use a u32 overlay to stage, then u16 to load buf.
union
{
uint32_t u32[4];
uint16_t u16[8];
} count;
count.u32[0] = ctx->count0 += (ptr << 2); // ptr/2 * 8
count.u32[1] = ctx->count1;
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
if ( vp == 0 ) // empty buf, xevan.
{
casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 );
memset_zero_256( (__m256i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else // half full buf, everyone else.
{
casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );
memset_zero_256( (__m256i*)buf + vp, 6 - vp );
}
casti_m256i( buf, 6 ) = m256_const1_128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
c512_2way( ctx, buf);
casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
}
#endif // AVX2
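A hedged illustration of the u32/u16 counter staging used in the function above: the bit count is staged as 32-bit words, then re-read 16 bits at a time because it sits 16-bit-misaligned across the last two vectors, with the 0x0200 output size in the top lane.

#include <stdint.h>
#include <stdio.h>

int main()
{
    union { uint32_t u32[4]; uint16_t u16[8]; } count;
    count.u32[0] = 640;   /* e.g. an 80-byte message = 640 bits */
    count.u32[1] = count.u32[2] = count.u32[3] = 0;
    /* count.u16[0] lands in the top u16 of the second-last vector;
       u16[1..7] plus the 0x0200 output size fill the last vector. */
    printf( "%04x %04x\n", count.u16[0], count.u16[1] );
    return 0;
}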


@@ -18,8 +18,6 @@ void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst );
void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len );
void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len );
#endif // AVX2


@@ -396,96 +396,4 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
}
void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
const void *data, size_t len )
{
__m512i *h = (__m512i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
h[0] = m512_const1_128( iv[0] );
h[1] = m512_const1_128( iv[1] );
h[2] = m512_const1_128( iv[2] );
h[3] = m512_const1_128( iv[3] );
ctx->ptr =
ctx->count0 =
ctx->count1 =
ctx->count2 =
ctx->count3 = 0;
unsigned char *buf = ctx->buf;
size_t ptr = ctx->ptr;
// process full blocks and load buf with remainder.
while ( len > 0 )
{
size_t clen;
clen = (sizeof ctx->buf) - ptr;
if ( clen > len << 2 )
clen = len << 2;
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= (clen >> 2);
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
{
ctx->count1 = ctx->count1 + 1;
if ( ctx->count1 == 0 )
{
ctx->count2 = ctx->count2 + 1;
if ( ctx->count2 == 0 )
ctx->count3 = ctx->count3 + 1;
}
}
c512_4way( ctx, buf );
ptr = 0;
}
}
uint32_t vp = ptr>>6;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned by 16 bits and straddles 2 vectors.
// Use a u32 overlay to stage, then u16 to load buf.
union
{
uint32_t u32[4];
uint16_t u16[8];
} count;
count.u32[0] = ctx->count0 += (ptr << 1); // ptr/4 * 8
count.u32[1] = ctx->count1;
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
if ( vp == 0 ) // empty buf, xevan.
{
casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 );
memset_zero_512( (__m512i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else // half full buf, everyone else.
{
casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 );
memset_zero_512( (__m512i*)buf + vp, 6 - vp );
}
casti_m512i( buf, 6 ) = m512_const1_128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
c512_4way( ctx, buf);
casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 );
casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 );
casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 );
casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
}
#endif // VAES


@@ -18,8 +18,6 @@ void shavite512_4way_update( shavite512_4way_context *ctx, const void *data,
void shavite512_4way_close( shavite512_4way_context *ctx, void *dst );
void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
const void *data, size_t len );
void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
const void *data, size_t len );
#endif // VAES


@@ -360,116 +360,18 @@ HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval,
return SUCCESS;
}
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
InitIV( state, 512, IV_512 );
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
u64 l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painful to implement and not required by the NIST API.
return FAIL;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return SUCCESS;
/*HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
BitSequence *hashval) {
hashState_sd s;
HashReturn r;
r = Init(&s, hashbitlen);
if (r != SUCCESS)
return r;
r = Update(&s, data, databitlen);
if (r != SUCCESS)
return r;
r = Final(&s, hashval);
return r;
}
*/
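A hedged worked example of the tail-bit masking in the function above (mask = 0xff >> (current & 7)):

#include <stdint.h>
#include <stdio.h>

int main()
{
    unsigned current = 13;                    /* 13 hashed bits        */
    uint8_t byte = 0xAB;                      /* 1010 1011             */
    uint8_t mask = 0xff >> ( current & 7 );   /* 0xff >> 5 = 0000 0111 */
    byte &= (uint8_t)~mask;                   /* keep top 5: 1010 1000 */
    printf( "%02x\n", byte );                 /* prints a8             */
    return 0;
}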


@@ -47,8 +47,8 @@ HashReturn final_sd(hashState_sd *state, BitSequence *hashval);
HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
//HashReturn Hash(int hashbitlen, const BitSequence *data, DataLength databitlen,
// BitSequence *hashval);
/*
* Internal API


@@ -1173,91 +1173,6 @@ int simd_4way_update_close( simd_4way_context *state, void *hashval,
return 0;
}
int simd512_4way_full( simd_4way_context *state, void *hashval,
const void *data, int datalen )
{
__m512i *A = (__m512i*)state->A;
state->hashbitlen = 512;
state->n_feistels = 8;
state->blocksize = 128*8;
state->count = 0;
for ( int i = 0; i < 8; i++ )
A[i] = _mm512_set4_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] );
int current, i;
int bs = state->blocksize; // bits in one lane
int isshort = 1;
uint64_t l;
int databitlen = datalen * 8;
current = state->count & (bs - 1);
while ( databitlen > 0 )
{
if ( current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_4way_Compress( state, data, 0 );
databitlen -= bs;
data += 4*( bs/8 );
state->count += bs;
}
else
{
// Copy a chunk of data to the buffer
int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer + 4*( current/8 ), data, 4*( (databitlen)/8 ) );
state->count += databitlen;
break;
}
else
{
memcpy( state->buffer + 4*(current/8), data, 4*(len/8) );
state->count += len;
databitlen -= len;
data += 4*( len/8 );
current = 0;
SIMD_4way_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
current = current / 8;
memset( state->buffer + 4*current, 0, 4*( state->blocksize/8 - current) );
SIMD_4way_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, 4*( state->blocksize/8 ) );
l = state->count;
for ( i = 0; i < 8; i++ )
{
state->buffer[ i ] = l & 0xff;
state->buffer[ i+16 ] = l & 0xff;
state->buffer[ i+32 ] = l & 0xff;
state->buffer[ i+48 ] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_4way_Compress( state, state->buffer, isshort );
memcpy( hashval, state->A, 4*( state->hashbitlen / 8 ) );
return 0;
}
#endif // AVX512
////////////////////////////////////
@@ -2014,90 +1929,4 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval,
return 0;
}
int simd512_2way_full( simd_2way_context *state, void *hashval,
const void *data, int datalen )
{
__m256i *A = (__m256i*)state->A;
state->hashbitlen = 512;
state->n_feistels = 8;
state->blocksize = 128*8;
state->count = 0;
for ( int i = 0; i < 8; i++ )
A[i] = _mm256_set_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0],
SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2],
SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] );
int current, i;
int bs = state->blocksize; // bits in one lane
int isshort = 1;
uint64_t l;
int databitlen = datalen * 8;
current = state->count & (bs - 1);
while ( databitlen > 0 )
{
if ( current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_2way_Compress( state, data, 0 );
databitlen -= bs;
data += 2*( bs/8 );
state->count += bs;
}
else
{
// Copy a chunk of data to the buffer
int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );
state->count += databitlen;
break;
}
else
{
memcpy( state->buffer + 2*(current/8), data, 2*(len/8) );
state->count += len;
databitlen -= len;
data += 2*( len/8 );
current = 0;
SIMD_2way_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
current = ( current+7 ) / 8;
memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) );
SIMD_2way_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, 2*( state->blocksize/8 ) );
l = state->count;
for ( i = 0; i < 8; i++ )
{
state->buffer[ i ] = l & 0xff;
state->buffer[ i+16 ] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_2way_Compress( state, state->buffer, isshort );
memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) );
return 0;
}
#endif


@@ -26,8 +26,6 @@ int simd_4way_update( simd_4way_context *state, const void *data,
int simd_4way_close( simd_4way_context *state, void *hashval );
int simd_4way_update_close( simd_4way_context *state, void *hashval,
const void *data, int databitlen );
int simd512_4way_full( simd_4way_context *state, void *hashval,
const void *data, int datalen );
#endif
@@ -47,8 +45,5 @@ int simd_2way_update( simd_2way_context *state, const void *data,
int simd_2way_close( simd_2way_context *state, void *hashval );
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen );
int simd512_2way_full( simd_2way_context *state, void *hashval,
const void *data, int datalen );
#endif
#endif


@@ -45,18 +45,18 @@ extern "C"{
#endif
/*
static const uint64_t IV256[] = {
0xCCD044A12FDB3E13, 0xE83590301A79A9EB,
0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB,
0xEC06025E74DD7683, 0xE7A436CDC4746251,
0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13
static const sph_u64 IV256[] = {
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
};
static const uint64_t IV512[] = {
0x4903ADFF749C51CE, 0x0D95DE399746DF03,
0x8FD1934127C79BCE, 0x9A255629FF352CB1,
0x5DB62599DF6CA7B0, 0xEABE394CA9D5C3F4,
0x991112C71A75B523, 0xAE18A40B660FCC33
static const sph_u64 IV512[] = {
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
};
*/
@@ -372,7 +372,7 @@ do { \
#define UBI_BIG_8WAY(etype, extra) \
do { \
uint64_t t0, t1, t2; \
sph_u64 t0, t1, t2; \
__m512i h8; \
__m512i m0 = buf[0]; \
__m512i m1 = buf[1]; \
@@ -391,8 +391,8 @@ do { \
__m512i p5 = m5; \
__m512i p6 = m6; \
__m512i p7 = m7; \
t0 = (uint64_t)(bcount << 6) + (uint64_t)(extra); \
t1 = (bcount >> 58) + ((uint64_t)(etype) << 55); \
t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
TFBIG_KINIT_8WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
TFBIG_8WAY_4e(0); \
TFBIG_8WAY_4o(1); \
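A hedged scalar sketch of the UBI tweak computed above: bcount counts 64-byte blocks, so bcount << 6 (plus extra) is the byte position, split across the two 64-bit tweak words, while the block type and first/final flags ride in the high bits via etype << 55.

#include <stdint.h>

static inline void ubi_tweak( uint64_t bcount, uint64_t extra,
                              uint64_t etype, uint64_t *t0, uint64_t *t1 )
{
    *t0 = ( bcount << 6 ) + extra;             /* low 64 bits: byte count */
    *t1 = ( bcount >> 58 ) + ( etype << 55 );  /* carry + type/flags      */
}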
@@ -425,7 +425,7 @@ do { \
#define DECL_STATE_BIG_8WAY \
__m512i h0, h1, h2, h3, h4, h5, h6, h7; \
uint64_t bcount;
sph_u64 bcount;
#endif // AVX512
@@ -488,7 +488,7 @@ do { \
// scale buf offset by 4
#define UBI_BIG_4WAY(etype, extra) \
do { \
uint64_t t0, t1, t2; \
sph_u64 t0, t1, t2; \
__m256i h8; \
__m256i m0 = buf[0]; \
__m256i m1 = buf[1]; \
@@ -507,8 +507,8 @@ do { \
__m256i p5 = m5; \
__m256i p6 = m6; \
__m256i p7 = m7; \
t0 = (uint64_t)(bcount << 6) + (uint64_t)(extra); \
t1 = (bcount >> 58) + ((uint64_t)(etype) << 55); \
t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
TFBIG_4WAY_4e(0); \
TFBIG_4WAY_4o(1); \
@@ -542,7 +542,7 @@ do { \
#define DECL_STATE_BIG_4WAY \
__m256i h0, h1, h2, h3, h4, h5, h6, h7; \
uint64_t bcount;
sph_u64 bcount;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -727,7 +727,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
{
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
sc->ptr = ptr + len;
if ( ptr < buf_size ) return;
return;
}
READ_STATE_BIG( sc );
@@ -745,8 +745,6 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
clen = buf_size - ptr;
if ( clen > len )
clen = len;
len -= clen;
if ( len == 0 ) break;
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata += (clen>>3);
@@ -771,12 +769,9 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
READ_STATE_BIG(sc);
if ( ptr )
{
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
}
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
memset_zero_256( buf, buf_size >> 3 );
bcount = 0;


@@ -48,8 +48,14 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
// Output size in bits
#define SPH_SIZE_skein256 256
#define SPH_SIZE_skein512 512
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct
@@ -57,11 +63,11 @@ typedef struct
__m512i buf[8];
__m512i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
uint64_t bcount;
} skein_8way_big_context __attribute__ ((aligned (128)));
sph_u64 bcount;
} sph_skein_8way_big_context __attribute__ ((aligned (128)));
typedef skein_8way_big_context skein512_8way_context;
typedef skein_8way_big_context skein256_8way_context;
typedef sph_skein_8way_big_context skein512_8way_context;
typedef sph_skein_8way_big_context skein256_8way_context;
void skein512_8way_init( skein512_8way_context *sc );
void skein512_8way_update( void *cc, const void *data, size_t len );
@@ -78,19 +84,21 @@ typedef struct
__m256i buf[8];
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
uint64_t bcount;
} skein_4way_big_context __attribute__ ((aligned (128)));
sph_u64 bcount;
} sph_skein_4way_big_context __attribute__ ((aligned (128)));
typedef skein_4way_big_context skein512_4way_context;
typedef skein_4way_big_context skein256_4way_context;
typedef sph_skein_4way_big_context skein512_4way_context;
typedef sph_skein_4way_big_context skein256_4way_context;
void skein512_4way_init( skein512_4way_context *sc );
void skein512_4way_update( void *cc, const void *data, size_t len );
void skein512_4way_close( void *cc, void *dst );
//#define skein512_4way skein512_4way_update
void skein256_4way_init( skein256_4way_context *sc );
void skein256_4way_update( void *cc, const void *data, size_t len );
void skein256_4way_close( void *cc, void *dst );
//#define skein256_4way skein256_4way_update
#ifdef __cplusplus
}


@@ -17,6 +17,8 @@ static __thread jh512_8way_context ctx_mid;
void tribus_hash_8way( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -42,8 +44,6 @@ void tribus_hash_8way( void *state, const void *input )
keccak512_8way_close( &ctx_keccak, vhash );
#if defined(__VAES__)
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );


@@ -214,14 +214,14 @@ int scanhash_drop( struct work *work, uint32_t max_nonce,
}
void drop_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t* end_nonce_ptr )
uint32_t* end_nonce_ptr, bool clean_job )
{
// ignore POK in first word
// const int nonce_i = 19;
const int wkcmp_sz = 72; // (19-1) * sizeof(uint32_t)
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
if ( memcmp( &work->data[1], &g_work->data[1], wkcmp_sz )
|| ( *nonceptr >= *end_nonce_ptr ) )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) )
{
work_free( work );
work_copy( work, g_work );


@@ -527,7 +527,7 @@ int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
mm256_bswap32_intrlv80_4x64( vdata, pdata );
blake512_4way_init( &x13bcd_ctx_mid );
blake512_4way_update( &x13bcd_ctx_mid, vdata, 64 );
blake512_4way( &x13bcd_ctx_mid, vdata, 64 );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(


@@ -227,7 +227,7 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
mm256_bswap32_intrlv80_4x64( vdata, pdata );
blake512_4way_init( &x13sm3_ctx_mid );
blake512_4way_update( &x13sm3_ctx_mid, vdata, 64 );
blake512_4way( &x13sm3_ctx_mid, vdata, 64 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )


@@ -76,13 +76,10 @@ union _hex_context_overlay
};
typedef union _hex_context_overlay hex_context_overlay;
static __thread hex_context_overlay hex_ctx;
void hex_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) hash[16];
hex_context_overlay ctx;
memcpy( &ctx, &hex_ctx, sizeof(ctx) );
void *in = (void*) input;
int size = 80;
/*
@@ -112,21 +109,23 @@ void hex_hash( void* output, const void* input )
break;
case GROESTL:
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, in, size );
sph_skein512_close( &ctx.skein, hash );
break;
case JH:
if ( i == 0 )
sph_jh512(&ctx.jh, in+64, 16 );
else
{
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
}
sph_jh512_init( &ctx.jh );
sph_jh512(&ctx.jh, in, size );
sph_jh512_close(&ctx.jh, hash );
break;
case KECCAK:
@@ -134,37 +133,15 @@ void hex_hash( void* output, const void* input )
sph_keccak512( &ctx.keccak, in, size );
sph_keccak512_close( &ctx.keccak, hash );
break;
case SKEIN:
if ( i == 0 )
sph_skein512(&ctx.skein, in+64, 16 );
else
{
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, in, size );
}
sph_skein512_close( &ctx.skein, hash );
break;
case LUFFA:
if ( i == 0 )
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in+64, 16 );
else
{
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in, size );
}
break;
(const BitSequence*)in, size );
break;
case CUBEHASH:
if ( i == 0 )
cubehashUpdateDigest( &ctx.cube, (byte*)hash,
(const byte*)in+64, 16 );
else
{
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash,
(const byte*)in, size );
}
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
@@ -178,8 +155,9 @@ void hex_hash( void* output, const void* input )
break;
case ECHO:
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,
(const BitSequence *)in, size );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
@@ -187,14 +165,9 @@ void hex_hash( void* output, const void* input )
#endif
break;
case HAMSI:
if ( i == 0 )
sph_hamsi512( &ctx.hamsi, in+64, 16 );
else
{
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, in, size );
}
sph_hamsi512_close( &ctx.hamsi, hash );
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512( &ctx.hamsi, in, size );
sph_hamsi512_close( &ctx.hamsi, hash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
@@ -202,24 +175,14 @@ void hex_hash( void* output, const void* input )
sph_fugue512_close( &ctx.fugue, hash );
break;
case SHABAL:
if ( i == 0 )
sph_shabal512( &ctx.shabal, in+64, 16 );
else
{
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, in, size );
}
sph_shabal512_close( &ctx.shabal, hash );
sph_shabal512_init( &ctx.shabal );
sph_shabal512( &ctx.shabal, in, size );
sph_shabal512_close( &ctx.shabal, hash );
break;
case WHIRLPOOL:
if ( i == 0 )
sph_whirlpool( &ctx.whirlpool, in+64, 16 );
else
{
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in, size );
}
sph_whirlpool_close( &ctx.whirlpool, hash );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in, size );
sph_whirlpool_close( &ctx.whirlpool, hash );
break;
case SHA_512:
SHA512_Init( &ctx.sha512 );
@@ -238,77 +201,47 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) edata[20];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
const int thr_id = mythr->id;
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint32_t ntime = swab32(pdata[17]);
if ( s_ntime != ntime )
{
hex_getAlgoString( (const uint32_t*) (&edata[1]), hashOrder );
hex_getAlgoString( (const uint32_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime );
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
// Do midstate prehash on hash functions with block size <= 64 bytes.
const char elem = hashOrder[0];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case JH:
sph_jh512_init( &hex_ctx.jh );
sph_jh512( &hex_ctx.jh, edata, 64 );
break;
case SKEIN:
sph_skein512_init( &hex_ctx.skein );
sph_skein512( &hex_ctx.skein, edata, 64 );
break;
case CUBEHASH:
cubehashInit( &hex_ctx.cube, 512, 16, 32 );
cubehashUpdate( &hex_ctx.cube, (const byte*)edata, 64 );
break;
case HAMSI:
sph_hamsi512_init( &hex_ctx.hamsi );
sph_hamsi512( &hex_ctx.hamsi, edata, 64 );
break;
case SHABAL:
sph_shabal512_init( &hex_ctx.shabal );
sph_shabal512( &hex_ctx.shabal, edata, 64 );
break;
case WHIRLPOOL:
sph_whirlpool_init( &hex_ctx.whirlpool );
sph_whirlpool( &hex_ctx.whirlpool, edata, 64 );
break;
}
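A hedged sketch of the midstate pattern this switch implements (names are illustrative, and the include path is an assumption matching this source tree): the constant first 64 bytes of the 80-byte header are hashed once per job into a thread-local context, and each nonce then copies that context and hashes only the last 16 bytes.

#include "algo/jh/sph_jh.h"   /* assumed project-local path */

static __thread sph_jh512_context jh_mid;     /* per-thread midstate */

static void prehash_job( const void *edata )  /* once per new job    */
{
    sph_jh512_init( &jh_mid );
    sph_jh512( &jh_mid, edata, 64 );          /* constant 64 bytes   */
}

static void hash_nonce( void *hash, const void *edata )  /* per nonce */
{
    sph_jh512_context ctx = jh_mid;           /* cheap context copy  */
    sph_jh512( &ctx, (const char*)edata + 64, 16 );
    sph_jh512_close( &ctx, hash );
}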
if ( opt_benchmark )
ptarget[7] = 0x0cff;
do
{
edata[19] = nonce;
hex_hash( hash32, edata );
be32enc( &endiandata[19], nonce );
hex_hash( hash32, endiandata );
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
if ( hash32[7] <= Htarg )
if (fulltest( hash32, ptarget ) && !opt_benchmark )
{
be32enc( &pdata[19], nonce );
pdata[19] = nonce;
submit_solution( work, hash32, mythr );
}
nonce++;
} while ( nonce < last_nonce && !(*restart) );
} while ( nonce < max_nonce && !(*restart) );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

File diff suppressed because it is too large.


@@ -1,7 +1,5 @@
#include "x16r-gate.h"
void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
{
char *sptr = output;


@@ -50,7 +50,7 @@ enum x16r_Algo {
X16R_HASH_FUNC_COUNT
};
extern void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
void x16r_getAlgoString( const uint8_t *prevblock, char *output );
void x16s_getAlgoString( const uint8_t *prevblock, char *output );
void x16rt_getAlgoString( const uint32_t *timeHash, char *output );

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -72,19 +72,27 @@ void x17_8way_hash( void *state, const void *input )
uint64_t hash7[8] __attribute__ ((aligned (64)));
x17_8way_context_overlay ctx;
blake512_8way_full( &ctx.blake, vhash, input, 80 );
// 1 Blake
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// 3 Groestl
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, 64 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
#else
@@ -92,44 +100,65 @@ void x17_8way_hash( void *state, const void *input )
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
#endif
// 4 Skein parallel 8 way 64 bit
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
// 5 JH
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
luffa512_4way_full( &ctx.luffa, vhashA, vhashA, 64 );
luffa512_4way_full( &ctx.luffa, vhashB, vhashB, 64 );
// 7 Luffa
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_4way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
// 8 Cubehash
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
// 9 Shavite
#if defined(__VAES__)
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_4way_full( &ctx.shavite, vhashB, vhashB, 64 );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
#else
@@ -166,13 +195,20 @@ void x17_8way_hash( void *state, const void *input )
#endif
simd512_4way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_4way_full( &ctx.simd, vhashB, vhashB, 64 );
// 10 Simd
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
// 11 Echo
#if defined(__VAES__)
echo_4way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_4way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
@@ -181,28 +217,37 @@ void x17_8way_hash( void *state, const void *input )
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
echo_full( &ctx.echo, (BitSequence *)hash4, 512,
(const BitSequence *)hash4, 64 );
echo_full( &ctx.echo, (BitSequence *)hash5, 512,
(const BitSequence *)hash5, 64 );
echo_full( &ctx.echo, (BitSequence *)hash6, 512,
(const BitSequence *)hash6, 64 );
echo_full( &ctx.echo, (BitSequence *)hash7, 512,
(const BitSequence *)hash7, 64 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
#endif
// 12 Hamsi
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
@@ -210,6 +255,7 @@ void x17_8way_hash( void *state, const void *input )
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 13 Fugue serial
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
@@ -235,6 +281,7 @@ void x17_8way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
// 14 Shabal, parallel 8 way 32 bit
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
@@ -245,6 +292,7 @@ void x17_8way_hash( void *state, const void *input )
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
@@ -270,6 +318,7 @@ void x17_8way_hash( void *state, const void *input )
sph_whirlpool( &ctx.whirlpool, hash7, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
// 16 SHA512 parallel 64 bit
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
@@ -277,6 +326,7 @@ void x17_8way_hash( void *state, const void *input )
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
rintrlv_8x64_8x32( vhashA, vhash, 512 );
haval256_5_8way_init( &ctx.haval );
@@ -299,31 +349,28 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
const bool bench = opt_benchmark;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x17_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( ( hash7[ lane ] <= Htarg ) && !bench ) )
if unlikely( ( hash7[ lane ] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) ) )
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
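
A note on the lane bookkeeping in the scanner above: the 8-way kernel hashes nonces n .. n+7 in parallel lanes, so a hit in lane k corresponds to nonce n + k, written back through bswap_32() because vdata was pre-swapped to big-endian. A minimal scalar sketch of that mapping follows; hash_lanes() and lane_meets_target() are hypothetical stand-ins, not the miner's real API.

#include <stdint.h>
#include <stdbool.h>

#define LANES 8

/* stand-ins, assumed to exist purely for illustration */
extern void hash_lanes( uint32_t hash7[LANES], uint32_t base_nonce );
extern bool lane_meets_target( uint32_t h7, uint32_t htarg );

/* Returns the winning nonce, or 0 if the range is exhausted. */
static uint32_t scan_sketch( uint32_t first, uint32_t last, uint32_t htarg )
{
    uint32_t hash7[LANES];
    for ( uint32_t n = first; n < last; n += LANES )
    {
        hash_lanes( hash7, n );             /* lanes hash n, n+1, ..., n+7 */
        for ( int lane = 0; lane < LANES; lane++ )
            if ( lane_meets_target( hash7[lane], htarg ) )
                return n + lane;            /* nonce belonging to this lane */
    }
    return 0;
}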
@@ -354,7 +401,7 @@ typedef union _x17_4way_context_overlay x17_4way_context_overlay;
void x17_4way_hash( void *state, const void *input )
{
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -363,59 +410,91 @@ void x17_4way_hash( void *state, const void *input )
uint64_t hash3[8] __attribute__ ((aligned (64)));
x17_4way_context_overlay ctx;
blake512_4way_full( &ctx.blake, vhash, input, 80 );
// 1 Blake parallel 4 way 64 bit
blake512_4way_init( &ctx.blake );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serialize
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 3 Groestl
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallelize
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
// 4 Skein parallel 4 way 64 bit
skein512_4way_init( &ctx.skein );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way_init( &ctx.jh );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way_init( &ctx.keccak );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// 7 Luffa parallel 2 way 128 bit
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
// 8 Cubehash
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );
// 9 Shavite
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
// 10 Simd
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4 way 64 bit
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
hamsi512_4way_init( &ctx.hamsi );
@@ -424,6 +503,7 @@ void x17_4way_hash( void *state, const void *input )
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
// 13 Fugue serial
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
@@ -437,6 +517,7 @@ void x17_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 4 way 32 bit
intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
shabal512_4way_init( &ctx.shabal );
@@ -445,6 +526,7 @@ void x17_4way_hash( void *state, const void *input )
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
@@ -458,12 +540,14 @@ void x17_4way_hash( void *state, const void *input )
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 16 SHA512 parallel 64 bit
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way_update( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
@@ -474,8 +558,8 @@ void x17_4way_hash( void *state, const void *input )
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[16*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[16*4] __attribute__ ((aligned (128)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
@@ -486,30 +570,27 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
const bool bench = opt_benchmark;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x17_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash7[ lane ] <= Htarg && !bench ) )
{
if unlikely( ( hash7[ lane ] <= Htarg ) )
{
extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) )
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}


@@ -71,7 +71,9 @@ void x17_hash(void *output, const void *input)
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
groestl512_full( &ctx.groestl, (char*)hash, (const char*)hash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash, 64 );
@@ -90,11 +92,14 @@ void x17_hash(void *output, const void *input)
sph_keccak512(&ctx.keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
luffa_full( &ctx.luffa, (BitSequence*)hash, 512,
(const BitSequence*)hash, 64 );
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// 8 Cube
cubehash_full( &ctx.cube, (byte*) hash, 512, (const byte*)hash, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)hash, 64 );
// 9 Shavite
sph_shavite512_init( &ctx.shavite );
@@ -102,13 +107,15 @@ void x17_hash(void *output, const void *input)
sph_shavite512_close( &ctx.shavite, hash);
// 10 Simd
simd_full( &ctx.simd, (BitSequence*)hash,
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
//11---echo---
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,
(const BitSequence *)hash, 64 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash, 64 );
@@ -154,8 +161,28 @@ int scanhash_x17( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] =
{
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] =
{
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need big-endian data...
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
@@ -163,14 +190,23 @@ int scanhash_x17( struct work *work, uint32_t max_nonce,
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
do
for ( int m = 0; m < 6; m++ )
{
pdata[19] = ++n;
be32enc( &endiandata[19], n );
x17_hash( hash64, endiandata );
if unlikely( valid_hash( hash64, ptarget ) && !opt_benchmark )
submit_solution( work, hash64, mythr );
} while ( n < max_nonce && !work_restart[thr_id].restart);
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
pdata[19] = ++n;
be32enc( &endiandata[19], n );
x17_hash( hash64, endiandata );
if ( !( hash64[7] & mask ) )
if ( fulltest( hash64, ptarget ) && !opt_benchmark )
submit_solution( work, hash64, mythr );
} while ( n < max_nonce && !work_restart[thr_id].restart);
break;
}
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
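
The htmax/masks pair above implements a cheap pre-filter: pick the first htmax entry that covers Htarg, then discard any candidate whose top hash word has a bit set outside the matching mask, running the expensive fulltest() only on survivors. A minimal sketch of the selection logic (illustrative only, not the miner's code):

#include <stdint.h>

static uint32_t pick_mask( uint32_t htarg )
{
    static const uint32_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
    static const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                                      0xFFFFF000, 0xFFFF0000, 0 };
    for ( int m = 0; m < 6; m++ )
        if ( htarg <= htmax[m] ) return masks[m];
    return 0;   /* easiest target class: nothing can be rejected early */
}
/* usage: if ( !( hash64[7] & pick_mask( Htarg ) ) ) run fulltest() */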


@@ -11,8 +11,9 @@
#include "algo/skein/skein-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -73,7 +74,9 @@ void xevan_8way_hash( void *output, const void *input )
const int dataLen = 128;
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_full( &ctx.blake, vhash, input, 80 );
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
memset( &vhash[8<<3], 0, 64<<3 );
bmw512_8way_init( &ctx.bmw );
@@ -84,8 +87,10 @@ void xevan_8way_hash( void *output, const void *input )
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, dataLen );
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, dataLen );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, dataLen<<3 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, dataLen<<3 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 );
@@ -94,14 +99,30 @@ void xevan_8way_hash( void *output, const void *input )
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
@@ -122,16 +143,22 @@ void xevan_8way_hash( void *output, const void *input )
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa512_4way_full( &ctx.luffa, vhashA, vhashA, dataLen );
luffa512_4way_full( &ctx.luffa, vhashB, vhashB, dataLen );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cube_4way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
#if defined(__VAES__)
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, dataLen );
shavite512_4way_full( &ctx.shavite, vhashB, vhashB, dataLen );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
#else
@@ -168,13 +195,17 @@ void xevan_8way_hash( void *output, const void *input )
#endif
simd512_4way_full( &ctx.simd, vhashA, vhashA, dataLen );
simd512_4way_full( &ctx.simd, vhashB, vhashB, dataLen );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
#if defined(__VAES__)
echo_4way_full( &ctx.echo, vhashA, 512, vhashA, dataLen );
echo_4way_full( &ctx.echo, vhashB, 512, vhashB, dataLen );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashA, vhashA, dataLen<<3 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashB, vhashB, dataLen<<3 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 );
@@ -183,23 +214,31 @@ void xevan_8way_hash( void *output, const void *input )
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash4, 512,
(const BitSequence *)hash4, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash5, 512,
(const BitSequence *)hash5, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash6, 512,
(const BitSequence *)hash6, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash7, 512,
(const BitSequence *)hash7, dataLen );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
@@ -289,7 +328,9 @@ void xevan_8way_hash( void *output, const void *input )
memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 );
blake512_8way_full( &ctx.blake, vhash, vhash, dataLen );
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, vhash, dataLen );
blake512_8way_close(&ctx.blake, vhash);
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, dataLen );
@@ -299,8 +340,10 @@ void xevan_8way_hash( void *output, const void *input )
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
groestl512_4way_full( &ctx.groestl, vhashA, vhashA, dataLen );
groestl512_4way_full( &ctx.groestl, vhashB, vhashB, dataLen );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, dataLen<<3 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, dataLen<<3 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 );
@@ -309,14 +352,30 @@ void xevan_8way_hash( void *output, const void *input )
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash4, (char*)hash4, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash5, (char*)hash5, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash6, (char*)hash6, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash7, (char*)hash7, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
@@ -337,16 +396,22 @@ void xevan_8way_hash( void *output, const void *input )
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa512_4way_full( &ctx.luffa, vhashA, vhashA, dataLen );
luffa512_4way_full( &ctx.luffa, vhashB, vhashB, dataLen );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cube_4way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
cube_4way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
#if defined(__VAES__)
shavite512_4way_full( &ctx.shavite, vhashA, vhashA, dataLen );
shavite512_4way_full( &ctx.shavite, vhashB, vhashB, dataLen );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
#else
@@ -383,13 +448,17 @@ void xevan_8way_hash( void *output, const void *input )
#endif
simd512_4way_full( &ctx.simd, vhashA, vhashA, dataLen );
simd512_4way_full( &ctx.simd, vhashB, vhashB, dataLen );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
#if defined(__VAES__)
echo_4way_full( &ctx.echo, vhashA, 512, vhashA, dataLen );
echo_4way_full( &ctx.echo, vhashB, 512, vhashB, dataLen );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashA, vhashA, dataLen<<3 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashB, vhashB, dataLen<<3 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 );
@@ -398,22 +467,30 @@ void xevan_8way_hash( void *output, const void *input )
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash4, 512,
(const BitSequence *)hash4, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash5, 512,
(const BitSequence *)hash5, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash6, 512,
(const BitSequence *)hash6, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash7, 512,
(const BitSequence *)hash7, dataLen );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
@@ -580,7 +657,9 @@ void xevan_4way_hash( void *output, const void *input )
// parallel 4 way
blake512_4way_full( &ctx.blake, vhash, input, 80 );
blake512_4way_init( &ctx.blake );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close(&ctx.blake, vhash);
memset( &vhash[8<<2], 0, 64<<2 );
bmw512_4way_init( &ctx.bmw );
@@ -590,10 +669,18 @@ void xevan_4way_hash( void *output, const void *input )
// Serial
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
// Parallel 4way
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -612,11 +699,15 @@ void xevan_4way_hash( void *output, const void *input )
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa512_2way_full( &ctx.luffa, vhashA, vhashA, dataLen );
luffa512_2way_full( &ctx.luffa, vhashB, vhashB, dataLen );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cube_2way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
cube_2way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
@@ -631,15 +722,18 @@ void xevan_4way_hash( void *output, const void *input )
dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, dataLen );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
// Parallel
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -711,10 +805,18 @@ void xevan_4way_hash( void *output, const void *input )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -732,11 +834,15 @@ void xevan_4way_hash( void *output, const void *input )
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa512_2way_full( &ctx.luffa, vhashA, vhashA, dataLen );
luffa512_2way_full( &ctx.luffa, vhashB, vhashB, dataLen );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cube_2way_full( &ctx.cube, vhashA, 512, vhashA, dataLen );
cube_2way_full( &ctx.cube, vhashB, 512, vhashB, dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
@@ -751,14 +857,18 @@ void xevan_4way_hash( void *output, const void *input )
dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, dataLen );
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, dataLen );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
@@ -824,7 +934,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;
int thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];


@@ -167,10 +167,10 @@ void x22i_8way_hash( void *output, const void *input )
#if defined(__VAES__)
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
#else
@@ -214,12 +214,12 @@ void x22i_8way_hash( void *output, const void *input )
#if defined(__VAES__)
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
#else

algo/yescrypt/sha256_Y.c (new file, 409 lines)

@@ -0,0 +1,409 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "sysendian.h"
#include "sha256_Y.h"
#include "compat.h"
/*
* Encode a length len/4 vector of (uint32_t) into a length len vector of
* (unsigned char) in big-endian form. Assumes len is a multiple of 4.
*/
static void
be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
{
size_t i;
for (i = 0; i < len / 4; i++)
be32enc(dst + i * 4, src[i]);
}
/*
* Decode a big-endian length len vector of (unsigned char) into a length
* len/4 vector of (uint32_t). Assumes len is a multiple of 4.
*/
static void
be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
{
size_t i;
for (i = 0; i < len / 4; i++)
dst[i] = be32dec(src + i * 4);
}
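For clarity, a tiny round-trip illustration of the two helpers above (example only, not part of the original file):
#if 0   /* example only */
uint32_t in[2] = { 0x01020304, 0xAABBCCDD };
unsigned char bytes[8];
uint32_t out[2];
be32enc_vect( bytes, in, 8 );   /* bytes = 01 02 03 04 AA BB CC DD */
be32dec_vect( out, bytes, 8 );  /* out[0] == in[0] && out[1] == in[1] */
#endif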
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1;
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, k) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + k)
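The index arithmetic in RNDr is worth spelling out: rather than shuffling the eight working variables after every round, the macro rotates which slot of S plays each role. Expanding the first two rounds (note added for clarity):
/* RNDr(S, W, 0, k) -> RND(S[0], S[1], S[2], S[3], S[4], S[5], S[6], S[7], W[0]+k)
 * RNDr(S, W, 1, k) -> RND(S[7], S[0], S[1], S[2], S[3], S[4], S[5], S[6], W[1]+k)
 * Each round the role assignment rotates by one slot, so the state words
 * themselves never have to be copied. */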
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform_Y(uint32_t * state, const unsigned char block[64])
{
uint32_t _ALIGN(128) W[64], S[8];
uint32_t t0, t1;
int i;
/* 1. Prepare message schedule W. */
be32dec_vect(W, block, 64);
for (i = 16; i < 64; i++)
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
RNDr(S, W, 0, 0x428a2f98);
RNDr(S, W, 1, 0x71374491);
RNDr(S, W, 2, 0xb5c0fbcf);
RNDr(S, W, 3, 0xe9b5dba5);
RNDr(S, W, 4, 0x3956c25b);
RNDr(S, W, 5, 0x59f111f1);
RNDr(S, W, 6, 0x923f82a4);
RNDr(S, W, 7, 0xab1c5ed5);
RNDr(S, W, 8, 0xd807aa98);
RNDr(S, W, 9, 0x12835b01);
RNDr(S, W, 10, 0x243185be);
RNDr(S, W, 11, 0x550c7dc3);
RNDr(S, W, 12, 0x72be5d74);
RNDr(S, W, 13, 0x80deb1fe);
RNDr(S, W, 14, 0x9bdc06a7);
RNDr(S, W, 15, 0xc19bf174);
RNDr(S, W, 16, 0xe49b69c1);
RNDr(S, W, 17, 0xefbe4786);
RNDr(S, W, 18, 0x0fc19dc6);
RNDr(S, W, 19, 0x240ca1cc);
RNDr(S, W, 20, 0x2de92c6f);
RNDr(S, W, 21, 0x4a7484aa);
RNDr(S, W, 22, 0x5cb0a9dc);
RNDr(S, W, 23, 0x76f988da);
RNDr(S, W, 24, 0x983e5152);
RNDr(S, W, 25, 0xa831c66d);
RNDr(S, W, 26, 0xb00327c8);
RNDr(S, W, 27, 0xbf597fc7);
RNDr(S, W, 28, 0xc6e00bf3);
RNDr(S, W, 29, 0xd5a79147);
RNDr(S, W, 30, 0x06ca6351);
RNDr(S, W, 31, 0x14292967);
RNDr(S, W, 32, 0x27b70a85);
RNDr(S, W, 33, 0x2e1b2138);
RNDr(S, W, 34, 0x4d2c6dfc);
RNDr(S, W, 35, 0x53380d13);
RNDr(S, W, 36, 0x650a7354);
RNDr(S, W, 37, 0x766a0abb);
RNDr(S, W, 38, 0x81c2c92e);
RNDr(S, W, 39, 0x92722c85);
RNDr(S, W, 40, 0xa2bfe8a1);
RNDr(S, W, 41, 0xa81a664b);
RNDr(S, W, 42, 0xc24b8b70);
RNDr(S, W, 43, 0xc76c51a3);
RNDr(S, W, 44, 0xd192e819);
RNDr(S, W, 45, 0xd6990624);
RNDr(S, W, 46, 0xf40e3585);
RNDr(S, W, 47, 0x106aa070);
RNDr(S, W, 48, 0x19a4c116);
RNDr(S, W, 49, 0x1e376c08);
RNDr(S, W, 50, 0x2748774c);
RNDr(S, W, 51, 0x34b0bcb5);
RNDr(S, W, 52, 0x391c0cb3);
RNDr(S, W, 53, 0x4ed8aa4a);
RNDr(S, W, 54, 0x5b9cca4f);
RNDr(S, W, 55, 0x682e6ff3);
RNDr(S, W, 56, 0x748f82ee);
RNDr(S, W, 57, 0x78a5636f);
RNDr(S, W, 58, 0x84c87814);
RNDr(S, W, 59, 0x8cc70208);
RNDr(S, W, 60, 0x90befffa);
RNDr(S, W, 61, 0xa4506ceb);
RNDr(S, W, 62, 0xbef9a3f7);
RNDr(S, W, 63, 0xc67178f2);
/* 4. Mix local working variables into global state */
for (i = 0; i < 8; i++)
state[i] += S[i];
#if 0
/* Clean the stack. */
memset(W, 0, 256);
memset(S, 0, 32);
t0 = t1 = 0;
#endif
}
static unsigned char PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Add padding and terminating bit-count. */
static void
SHA256_Pad_Y(SHA256_CTX_Y * ctx)
{
unsigned char len[8];
uint32_t r, plen;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be32enc_vect(len, ctx->count, 8);
/* Add 1--64 bytes so that the resulting length is 56 mod 64 */
r = (ctx->count[1] >> 3) & 0x3f;
plen = (r < 56) ? (56 - r) : (120 - r);
SHA256_Update_Y(ctx, PAD, (size_t)plen);
/* Add the terminating bit-count */
SHA256_Update_Y(ctx, len, 8);
}
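A worked example of the padding arithmetic (added for clarity):
/* With r = 40 bytes buffered, plen = 56 - 40 = 16, giving 56 bytes mod 64;
 * the 8-byte bit count then completes the block. With r = 60, plen =
 * 120 - 60 = 60, spilling into a second block: 60 + 60 = 120, and
 * 120 mod 64 = 56, again leaving exactly 8 bytes for the count. */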
/* SHA-256 initialization. Begins a SHA-256 operation. */
void
SHA256_Init_Y(SHA256_CTX_Y * ctx)
{
/* Zero bits processed so far */
ctx->count[0] = ctx->count[1] = 0;
/* Magic initialization constants */
ctx->state[0] = 0x6A09E667;
ctx->state[1] = 0xBB67AE85;
ctx->state[2] = 0x3C6EF372;
ctx->state[3] = 0xA54FF53A;
ctx->state[4] = 0x510E527F;
ctx->state[5] = 0x9B05688C;
ctx->state[6] = 0x1F83D9AB;
ctx->state[7] = 0x5BE0CD19;
}
/* Add bytes into the hash */
void
SHA256_Update_Y(SHA256_CTX_Y * ctx, const void *in, size_t len)
{
uint32_t bitlen[2];
uint32_t r;
const unsigned char *src = in;
/* Number of bytes left in the buffer from previous updates */
r = (ctx->count[1] >> 3) & 0x3f;
/* Convert the length into a number of bits */
bitlen[1] = ((uint32_t)len) << 3;
bitlen[0] = (uint32_t)(len >> 29);
/* Update number of bits */
if ((ctx->count[1] += bitlen[1]) < bitlen[1])
ctx->count[0]++;
ctx->count[0] += bitlen[0];
/* Handle the case where we don't need to perform any transforms */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return;
}
/* Finish the current block */
memcpy(&ctx->buf[r], src, 64 - r);
SHA256_Transform_Y(ctx->state, ctx->buf);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks */
while (len >= 64) {
SHA256_Transform_Y(ctx->state, src);
src += 64;
len -= 64;
}
/* Copy left over data into buffer */
memcpy(ctx->buf, src, len);
}
/*
* SHA-256 finalization. Pads the input data, exports the hash value,
* and clears the context state.
*/
void
SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx)
{
/* Add padding */
SHA256_Pad_Y(ctx);
/* Write the hash */
be32enc_vect(digest, ctx->state, 32);
/* Clear the context state */
memset((void *)ctx, 0, sizeof(*ctx));
}
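Together these form the usual Init/Update/Final pattern; a minimal usage sketch (example only, not from the original file):
#if 0   /* example only */
SHA256_CTX_Y ctx;
unsigned char digest[32];
SHA256_Init_Y( &ctx );
SHA256_Update_Y( &ctx, "abc", 3 );
SHA256_Final_Y( digest, &ctx );
/* digest == SHA-256("abc") = ba7816bf 8f01cfea 414140de 5dae2223
 *                            b00361a3 96177a9c b410ff61 f20015ad */
#endif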
/* Initialize an HMAC-SHA256 operation with the given key. */
void
HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen)
{
unsigned char pad[64];
unsigned char khash[32];
const unsigned char * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
SHA256_Init(&ctx->ictx);
SHA256_Update(&ctx->ictx, K, Klen);
SHA256_Final(khash, &ctx->ictx);
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init(&ctx->ictx);
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
SHA256_Update(&ctx->ictx, pad, 64);
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
SHA256_Init(&ctx->octx);
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
SHA256_Update(&ctx->octx, pad, 64);
/* Clean the stack. */
//memset(khash, 0, 32);
}
/* Add bytes to the HMAC-SHA256 operation. */
void
HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
SHA256_Update(&ctx->ictx, in, len);
}
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx)
{
unsigned char ihash[32];
/* Finish the inner SHA256 operation. */
SHA256_Final(ihash, &ctx->ictx);
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update(&ctx->octx, ihash, 32);
/* Finish the outer SHA256 operation. */
SHA256_Final(digest, &ctx->octx);
/* Clean the stack. */
//memset(ihash, 0, 32);
}
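The same pattern applies to the HMAC wrapper; a minimal usage sketch (example only):
#if 0   /* example only */
HMAC_SHA256_CTX_Y hctx;
unsigned char mac[32];
HMAC_SHA256_Init_Y( &hctx, "key", 3 );
HMAC_SHA256_Update_Y( &hctx, "message", 7 );
HMAC_SHA256_Final_Y( mac, &hctx );
#endif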
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX_Y PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint8_t ivec[4];
size_t i, clen;
uint64_t j;
int k;
/* Compute HMAC state after processing P and S. */
HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update_Y(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y));
HMAC_SHA256_Update_Y(&hctx, ivec, 4);
HMAC_SHA256_Final_Y(U, &hctx);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen);
HMAC_SHA256_Update_Y(&hctx, U, 32);
HMAC_SHA256_Final_Y(U, &hctx);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
//memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
}
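For reference, the yescrypt KDF below drives this with a single iteration (c = 1), using it purely as a keyed expansion step. A minimal usage sketch (example only):
#if 0   /* example only */
uint8_t dk[64];
PBKDF2_SHA256_Y( (const uint8_t *)"passwd", 6,
                 (const uint8_t *)"salt", 4, 1, dk, sizeof(dk) );
#endif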


@@ -1,6 +1,5 @@
/*-
* Copyright 2009 Colin Percival
* Copyright 2013-2018 Alexander Peslyak
* Copyright 2005,2007,2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -24,26 +23,47 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
* $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $
*/
#ifndef _YESPOWERR8G_H_
#define _YESPOWERR8G_H_
#ifndef _SHA256_H_
#define _SHA256_H_
#include <sys/types.h>
#include <stdint.h>
#include <stdlib.h> /* for size_t */
#include "algo-gate-api.h"
#include "algo/yespower/yespower.h"
#include <openssl/sha.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct SHA256Context {
uint32_t state[8];
uint32_t count[2];
unsigned char buf[64];
} SHA256_CTX_Y;
extern int yespowerr8g_tls(const uint8_t *src, size_t srclen,
const yespower_params_t *params, yespower_binary_t *dst);
/*
typedef struct HMAC_SHA256Context {
SHA256_CTX_Y ictx;
SHA256_CTX_Y octx;
} HMAC_SHA256_CTX_Y;
*/
#ifdef __cplusplus
}
#endif
typedef struct HMAC_SHA256Context {
SHA256_CTX ictx;
SHA256_CTX octx;
} HMAC_SHA256_CTX_Y;
#endif /* !_YESPOWERR8G_H_ */
void SHA256_Init_Y(SHA256_CTX_Y *);
void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t);
void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *);
void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256_Y(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#endif /* !_SHA256_H_ */

algo/yescrypt/sysendian.h (new file, 124 lines)

@@ -0,0 +1,124 @@
/*-
* Copyright 2007-2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#ifndef _SYSENDIAN_H_
#define _SYSENDIAN_H_
/* If we don't have be64enc, the <sys/endian.h> we have isn't usable. */
#if !HAVE_DECL_BE64ENC
#undef HAVE_SYS_ENDIAN_H
#endif
#ifdef HAVE_SYS_ENDIAN_H
#include <sys/endian.h>
#else
#include <stdint.h>
static __inline uint64_t
be64dec(const void *pp)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) +
((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) +
((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) +
((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56));
}
static __inline void
be64enc(void *pp, uint64_t x)
{
uint8_t * p = (uint8_t *)pp;
p[7] = x & 0xff;
p[6] = (x >> 8) & 0xff;
p[5] = (x >> 16) & 0xff;
p[4] = (x >> 24) & 0xff;
p[3] = (x >> 32) & 0xff;
p[2] = (x >> 40) & 0xff;
p[1] = (x >> 48) & 0xff;
p[0] = (x >> 56) & 0xff;
}
static __inline uint64_t
le64dec(const void *pp)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) +
((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) +
((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) +
((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56));
}
static __inline void
le64enc(void *pp, uint64_t x)
{
uint8_t * p = (uint8_t *)pp;
p[0] = x & 0xff;
p[1] = (x >> 8) & 0xff;
p[2] = (x >> 16) & 0xff;
p[3] = (x >> 24) & 0xff;
p[4] = (x >> 32) & 0xff;
p[5] = (x >> 40) & 0xff;
p[6] = (x >> 48) & 0xff;
p[7] = (x >> 56) & 0xff;
}
static __inline uint32_t
be32dec(const void *pp)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
}
static __inline void
be32enc(void *pp, uint32_t x)
{
uint8_t * p = (uint8_t *)pp;
p[3] = x & 0xff;
p[2] = (x >> 8) & 0xff;
p[1] = (x >> 16) & 0xff;
p[0] = (x >> 24) & 0xff;
}
#endif /* !HAVE_SYS_ENDIAN_H */
#endif /* !_SYSENDIAN_H_ */
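A quick self-check of these helpers (example only): encode/decode pairs must round-trip regardless of host endianness, which is the whole point of spelling out the byte moves.
#if 0   /* example only; requires <assert.h> */
uint8_t b8[8], b4[4];
be64enc( b8, 0x0102030405060708ULL );  /* b8 = 01 02 03 04 05 06 07 08 */
assert( be64dec( b8 ) == 0x0102030405060708ULL );
be32enc( b4, 0xDEADBEEF );             /* b4 = DE AD BE EF */
assert( be32dec( b4 ) == 0xDEADBEEF );
#endif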


@@ -48,7 +48,9 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "algo/sha/hmac-sha256-hash.h"
#include "sha256_Y.h"
#include "sysendian.h"
#include "yescrypt.h"
#include "yescrypt-platform.h"
@@ -1310,7 +1312,7 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
}
/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size);
PBKDF2_SHA256_Y(passwd, passwdlen, salt, saltlen, 1, B, B_size);
if (t || flags)
memcpy(sha256, B, sizeof(sha256));
@@ -1340,7 +1342,7 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
}
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen);
PBKDF2_SHA256_Y(passwd, passwdlen, B, B_size, 1, buf, buflen);
/*
* Except when computing classic scrypt, allow all computation so far
@@ -1352,14 +1354,14 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
if ((t || flags) && buflen == sizeof(sha256)) {
/* Compute ClientKey */
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init(&ctx, buf, buflen);
HMAC_SHA256_CTX_Y ctx;
HMAC_SHA256_Init_Y(&ctx, buf, buflen);
if ( yescrypt_client_key )
HMAC_SHA256_Update( &ctx, (uint8_t*)yescrypt_client_key,
HMAC_SHA256_Update_Y( &ctx, (uint8_t*)yescrypt_client_key,
yescrypt_client_key_len );
else
HMAC_SHA256_Update( &ctx, salt, saltlen );
HMAC_SHA256_Final(sha256, &ctx);
HMAC_SHA256_Update_Y( &ctx, salt, saltlen );
HMAC_SHA256_Final_Y(sha256, &ctx);
}
/* Compute StoredKey */
{


@@ -25,7 +25,7 @@
#include "compat.h"
#include "yescrypt.h"
#include "algo/sha/hmac-sha256-hash.h"
#include "sha256_Y.h"
#include "algo-gate-api.h"
#define BYTES2CHARS(bytes) \
@@ -385,30 +385,35 @@ void yescrypthash(void *output, const void *input)
int scanhash_yescrypt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
endiandata[19] = n;
do {
yescrypt_hash((char*) endiandata, (char*) vhash, 80);
if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark )
{
be32enc( pdata+19, n );
submit_solution( work, vhash, mythr );
}
endiandata[19] = ++n;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
for (int k = 0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
do {
be32enc(&endiandata[19], n);
yescrypt_hash((char*) endiandata, (char*) vhash, 80);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget )
&& !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, vhash, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
void yescrypt_gate_base(algo_gate_t *gate )


@@ -30,8 +30,9 @@
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include <algo/yespower/crypto/sph_types.h>
#include <algo/yespower/utils/sysendian.h>
#include "blake2b-yp.h"
// Cyclic right rotation.
@@ -271,7 +272,7 @@ void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t *
{
hmac_yp_ctx PShctx, hctx;
size_t i;
uint32_t ivec;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
@@ -285,11 +286,11 @@ void pbkdf2_blake2b_yp(const uint8_t * passwd, size_t passwdlen, const uint8_t *
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
ivec = bswap_32( i+1 );
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(hmac_yp_ctx));
hmac_blake2b_yp_update(&hctx, &ivec, 4);
hmac_blake2b_yp_update(&hctx, ivec, 4);
hmac_blake2b_yp_final(&hctx, U);
/* T_i = U_1 ... */
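A note on the ivec change in this hunk (added for clarity): PBKDF2 feeds the PRF INT(i), a 4-byte big-endian block index. bswap_32(i+1) only yields that byte order on a little-endian host, whereas be32enc() writes the bytes explicitly and is endian-neutral:
/* i = 0: be32enc(ivec, 1) -> ivec = { 0x00, 0x00, 0x00, 0x01 } on any host,
 * while bswap_32(1) stored as a uint32_t produces those bytes only when
 * the host is little-endian. */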


@@ -0,0 +1 @@
#define insecure_memzero(buf, len) /* empty */


@@ -28,10 +28,46 @@
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "hmac-sha256-hash.h"
#include "sysendian.h"
#include "sha256_p.h"
#include "compat.h"
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1;
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, k) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + k)
/*
static unsigned char PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
*/
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
@@ -40,9 +76,9 @@ void
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
{
SHA256_CTX ctx;
SHA256_Init( &ctx );
SHA256_Update( &ctx, in, len );
SHA256_Final( digest, &ctx );
SHA256_Init( &ctx );
SHA256_Update( &ctx, in, len );
SHA256_Final( digest, &ctx );
}
/**
@@ -51,18 +87,19 @@ SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf( const void *K, size_t Klen, const void *in, size_t len,
uint8_t digest[32])
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init( &ctx, K, Klen );
HMAC_SHA256_Update( &ctx, in, len );
HMAC_SHA256_Final( digest, &ctx );
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init( &ctx, K, Klen );
HMAC_SHA256_Update( &ctx, in, len );
HMAC_SHA256_Final( digest, &ctx );
}
/* Initialize an HMAC-SHA256 operation with the given key. */
void
HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
HMAC_SHA256_Init( HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen )
{
unsigned char pad[64];
unsigned char khash[32];
@@ -70,8 +107,7 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
if (Klen > 64) {
SHA256_Init( &ctx->ictx );
SHA256_Update( &ctx->ictx, K, Klen );
SHA256_Final( khash, &ctx->ictx );
@@ -80,7 +116,7 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init( &ctx->ictx );
SHA256_Init( &ctx->ictx );
memset( pad, 0x36, 64 );
for ( i = 0; i < Klen; i++ )
pad[i] ^= K[i];
@@ -92,19 +128,23 @@ HMAC_SHA256_Init( HMAC_SHA256_CTX *ctx, const void *_K, size_t Klen )
for ( i = 0; i < Klen; i++ )
pad[i] ^= K[i];
SHA256_Update( &ctx->octx, pad, 64 );
/* Clean the stack. */
//memset(khash, 0, 32);
}
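
Putting the two pad setups together: this Init routine implements the standard RFC 2104 composition, where K' is the key zero-padded to the 64-byte SHA256 block size (or first replaced by SHA256(K) when longer, as handled above):

/* HMAC-SHA256(K, m) = SHA256( (K' XOR opad) || SHA256( (K' XOR ipad) || m ) )
 * with ipad = 0x36 repeated 64 times and opad = 0x5c repeated 64 times
 * (the outer 0x5c memset falls between the hunks shown above); ictx
 * absorbs (K' XOR ipad) here, octx absorbs (K' XOR opad), and message
 * bytes are streamed into ictx by HMAC_SHA256_Update. */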
/* Add bytes to the HMAC-SHA256 operation. */
void
HMAC_SHA256_Update( HMAC_SHA256_CTX *ctx, const void *in, size_t len )
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
SHA256_Update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx )
{
unsigned char ihash[32];
@@ -116,6 +156,9 @@ HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
/* Finish the outer SHA256 operation. */
SHA256_Final( digest, &ctx->octx );
/* Clean the stack. */
//memset(ihash, 0, 32);
}
/**
@@ -124,51 +167,52 @@ HMAC_SHA256_Final( unsigned char digest[32], HMAC_SHA256_CTX *ctx )
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256( const uint8_t *passwd, size_t passwdlen, const uint8_t *salt,
size_t saltlen, uint64_t c, uint8_t *buf, size_t dkLen )
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint32_t ivec;
uint8_t ivec[4];
size_t i, clen;
uint64_t j;
int k;
/* Compute HMAC state after processing P and S. */
HMAC_SHA256_Init( &PShctx, passwd, passwdlen );
HMAC_SHA256_Update( &PShctx, salt, saltlen );
HMAC_SHA256_Init(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for ( i = 0; i * 32 < dkLen; i++ )
{
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
ivec = bswap_32( i+1 );
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy( &hctx, &PShctx, sizeof(HMAC_SHA256_CTX) );
HMAC_SHA256_Update( &hctx, &ivec, 4 );
HMAC_SHA256_Final( U, &hctx );
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
HMAC_SHA256_Update(&hctx, ivec, 4);
HMAC_SHA256_Final(U, &hctx);
/* T_i = U_1 ... */
memcpy( T, U, 32 );
memcpy(T, U, 32);
for ( j = 2; j <= c; j++ )
{
for (j = 2; j <= c; j++) {
/* Compute U_j. */
HMAC_SHA256_Init( &hctx, passwd, passwdlen );
HMAC_SHA256_Update( &hctx, U, 32 );
HMAC_SHA256_Final( U, &hctx );
HMAC_SHA256_Init(&hctx, passwd, passwdlen);
HMAC_SHA256_Update(&hctx, U, 32);
HMAC_SHA256_Final(U, &hctx);
/* ... xor U_j ... */
for ( k = 0; k < 32; k++ )
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if ( clen > 32 )
if (clen > 32)
clen = 32;
memcpy( &buf[i * 32], T, clen );
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
//memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
}
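
As a usage sketch, the function can be exercised against the published PBKDF2-HMAC-SHA256 test vectors in RFC 7914, section 11 (expected digests not reproduced here); the function and header names follow this diff, while the harness itself is illustrative:

/* Hypothetical self-check harness for PBKDF2_SHA256 (illustrative). */
#include <stdio.h>
#include <stdint.h>
#include "sha256_p.h"   /* declares PBKDF2_SHA256 per this diff */

int main( void )
{
    /* RFC 7914 s.11 inputs: P="passwd", S="salt", c=1, dkLen=64 */
    uint8_t dk[64];
    PBKDF2_SHA256( (const uint8_t*)"passwd", 6, (const uint8_t*)"salt", 4,
                   1, dk, 64 );
    for ( int k = 0; k < 64; k++ )
        printf( "%02x", dk[k] );
    printf( "\n" );   /* compare with the RFC 7914 expected output */
    return 0;
}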

Some files were not shown because too many files have changed in this diff.