diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 51ded93..380d835 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -35,13 +35,18 @@ not supported. FreeBSD YMMV.
 Change Log
 ----------
 
+
+v3.10.7
+
+AVX512 for x25x, lbry, x13bcd (alias bcd).
+
 v3.10.6
 
 Added support for SSL stratum: stratum+tcps://
 
 Added job id reporting again, but leaner, suppressed with --quiet.
 
-AVX512 for x21s, x22i, lyra2z, allium
+AVX512 for x21s, x22i, lyra2z, allium.
 
 Fixed share overflow warnings mining lbry with Ryzen (SHA).
 
diff --git a/algo-gate-api.c b/algo-gate-api.c
index cebfc8b..f77ee29 100644
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -317,6 +317,7 @@ const char* const algo_alias_map[][2] =
   { "argon2d-crds",      "argon2d250"   },
   { "argon2d-dyn",       "argon2d500"   },
   { "argon2d-uis",       "argon2d4096"  },
+  { "bcd",               "x13bcd"       },
   { "bitcore",           "timetravel10" },
   { "bitzeny",           "yescryptr8"   },
   { "blake256r8",        "blakecoin"    },
diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h
index 9f389f6..091a537 100644
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -104,7 +104,7 @@ typedef struct {
 typedef blake_8way_small_context blake256_8way_context;
 void blake256_8way_init(void *cc);
 void blake256_8way_update(void *cc, const void *data, size_t len);
-#define blake256_8way blake256_8way_update
+//#define blake256_8way blake256_8way_update
 void blake256_8way_close(void *cc, void *dst);
 
 // 14 rounds, blake, decred
diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c
index f958659..3de0363 100644
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -842,7 +842,8 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
 }
 
 static void
-blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len )
+blake32_4way( blake_4way_small_context *ctx, const void *data,
+              size_t len )
 {
    __m128i *buf = (__m128i*)ctx->buf;
    size_t  bptr = ctx->ptr<<2;
@@ -1237,7 +1238,7 @@ blake256_4way_init(void *ctx)
 }
 
 void
-blake256_4way(void *ctx, const void *data, size_t len)
+blake256_4way_update(void *ctx, const void *data, size_t len)
 {
 	blake32_4way(ctx, data, len);
 }
diff --git a/algo/blake/blake2s-hash-4way.h b/algo/blake/blake2s-hash-4way.h
index 953841f..baf2865 100644
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -14,7 +14,6 @@
 #ifndef __BLAKE2S_HASH_4WAY_H__
 #define __BLAKE2S_HASH_4WAY_H__ 1
 
-//#if defined(__SSE4_2__)
 #if defined(__SSE2__)
 
 #include "simd-utils.h"
@@ -132,6 +131,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
 }
 #endif
 
-#endif  // __SSE4_2__
+#endif  // __SSE2__
 
 #endif
diff --git a/algo/bmw/bmw512-4way.c b/algo/bmw/bmw512-4way.c
index 2757fdd..795be11 100644
--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -41,7 +41,6 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
 
       for ( int lane = 0; lane < 8; lane++ )
       if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
-//      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
       {
           extr_lane_8x64( lane_hash, hash, lane, 256 );
           if ( fulltest( lane_hash, ptarget ) )
@@ -66,7 +65,7 @@ void bmw512hash_4way(void *state, const void *input)
 {
     bmw512_4way_context ctx;
     bmw512_4way_init( &ctx );
-    bmw512_4way( &ctx, input, 80 );
+    bmw512_4way_update( &ctx, input, 80 );
     bmw512_4way_close( &ctx, state );
 }
 
diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c
index 17f0cf1..9a57177 100644
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -45,7 +45,7 @@ void myriad_4way_hash( void *output, const void *input )
 
      intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
 
-     sha256_4way( &ctx.sha, vhash, 64 );
+     sha256_4way_update( &ctx.sha, vhash, 64 );
      sha256_4way_close( &ctx.sha, output );
 }
 
diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c
index 0a1e6e2..d86bd42 100644
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -1171,7 +1171,8 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
    sc->h[7] = m256_const1_64( 0x6769756d2042656c );
 }
 
-void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
+void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
+      size_t len )
 {
    __m256i *vdata = (__m256i*)data;
 
diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h
index 4e57f10..60e33b2 100644
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -62,7 +62,7 @@ typedef hamsi_4way_big_context hamsi512_4way_context;
 void hamsi512_4way_init( hamsi512_4way_context *sc );
 void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
       size_t len );
-#define hamsi512_4way hamsi512_4way_update
+//#define hamsi512_4way hamsi512_4way_update
 void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
 
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c
index c9e7ad8..313b23f 100644
--- a/algo/haval/haval-4way-helper.c
+++ b/algo/haval/haval-4way-helper.c
@@ -38,7 +38,7 @@
 #define SPH_XCAT_(a, b)   a ## b
 
 static void
-SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
+SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
 ( haval_4way_context *sc, const void *data, size_t len )
 {
    __m128i *vdata = (__m128i*)data;
diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c
index 02df40f..6b45e10 100644
--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -479,9 +479,9 @@ haval ## xxx ## _ ## y ## _4way_init(void *cc) \
 } \
  \
 void \
-haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \
+haval ## xxx ## _ ## y ## _4way_update (void *cc, const void *data, size_t len) \
 { \
-	haval ## y ## _4way(cc, data, len); \
+	haval ## y ## _4way_update(cc, data, len); \
 } \
  \
 void \
diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h
index 9bd37ba..9164d2f 100644
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -85,7 +85,7 @@ typedef haval_4way_context haval256_5_4way_context;
 void haval256_5_4way_init( void *cc );
 
 void haval256_5_4way_update( void *cc, const void *data, size_t len );
-#define haval256_5_4way haval256_5_4way_update
+//#define haval256_5_4way haval256_5_4way_update
 
 void haval256_5_4way_close( void *cc, void *dst );
 
diff --git a/algo/jh/jh-hash-4way.h b/algo/jh/jh-hash-4way.h
index 5cccebd..562fd5e 100644
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -103,14 +103,12 @@ typedef jh_4way_context jh512_4way_context;
 void jh256_4way_init( jh_4way_context *sc);
 
 void jh256_4way_update(void *cc, const void *data, size_t len);
-#define jh256_4way jh256_4way_update
 
 void jh256_4way_close(void *cc, void *dst);
 
 void jh512_4way_init( jh_4way_context *sc );
 
 void jh512_4way_update(void *cc, const void *data, size_t len);
-#define jh512_4way jh512_4way_update
 
 void jh512_4way_close(void *cc, void *dst);
 
diff --git a/algo/jh/jha-4way.c b/algo/jh/jha-4way.c
index 2c76a33..68ffe7f 100644
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -33,7 +33,7 @@ void jha_hash_4way( void *out, const void *input )
     keccak512_4way_context ctx_keccak;
 
     keccak512_4way_init( &ctx_keccak );
-    keccak512_4way( &ctx_keccak, input, 80 );
+    keccak512_4way_update( &ctx_keccak, input, 80 );
     keccak512_4way_close( &ctx_keccak, vhash );
 
     // Heavy & Light Pair Loop
@@ -58,7 +58,7 @@ void jha_hash_4way( void *out, const void *input )
        intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
 
        skein512_4way_init( &ctx_skein );
-       skein512_4way( &ctx_skein, vhash, 64 );
+       skein512_4way_update( &ctx_skein, vhash, 64 );
        skein512_4way_close( &ctx_skein, vhashB );
 
        for ( int i = 0; i < 8; i++ )
@@ -69,7 +69,7 @@ void jha_hash_4way( void *out, const void *input )
        blake512_4way_close( &ctx_blake, vhashA );
 
        jh512_4way_init( &ctx_jh );
-       jh512_4way( &ctx_jh, vhash, 64 );
+       jh512_4way_update( &ctx_jh, vhash, 64 );
        jh512_4way_close( &ctx_jh, vhashB );
 
        for ( int i = 0; i < 8; i++ )
diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h
index d8500a6..a353856 100644
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -99,14 +99,12 @@ typedef keccak64_ctx_m256i keccak512_4way_context;
 void keccak256_4way_init(void *cc);
 void keccak256_4way_update(void *cc, const void *data, size_t len);
 void keccak256_4way_close(void *cc, void *dst);
-#define keccak256_4way keccak256_4way_update
 
 void keccak512_4way_init(void *cc);
 void keccak512_4way_update(void *cc, const void *data, size_t len);
 void keccak512_4way_close(void *cc, void *dst);
 void keccak512_4way_addbits_and_close(
         void *cc, unsigned ub, unsigned n, void *dst);
-#define keccak512_4way keccak512_4way_update
 
 #endif
 
diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c
index a02d0f1..c06f813 100644
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -55,7 +55,6 @@ void allium_8way_hash( void *state, const void *input )
    dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  vhash, 256 );
 
-
    intrlv_2x256( vhash, hash0, hash1, 256 );
    LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
    dintrlv_2x256( hash0, hash1, vhash, 256 );
@@ -69,19 +68,6 @@ void allium_8way_hash( void *state, const void *input )
    LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
    dintrlv_2x256( hash6, hash7, vhash, 256 );
   
-/* 
-   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
-   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
-   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
-   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
-   LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
-   LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
-   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
-   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
-*/
-
-
-
    intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
    intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
 
@@ -105,20 +91,6 @@ void allium_8way_hash( void *state, const void *input )
    LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
    dintrlv_2x256( hash6, hash7, vhash, 256 );
 
-
-/*
-   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
-   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
-   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
-   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
-   LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
-   LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
-   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
-   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
-*/
-
-
-
    intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                 hash7, 256 );
 
@@ -232,11 +204,11 @@ void allium_4way_hash( void *state, const void *input )
    allium_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
 
    memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
-   blake256_4way( &ctx.blake, input + (64<<2), 16 );
+   blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
    blake256_4way_close( &ctx.blake, vhash32 );
 
    rintrlv_4x32_4x64( vhash64, vhash32, 256 );
-   keccak256_4way( &ctx.keccak, vhash64, 32 );
+   keccak256_4way_update( &ctx.keccak, vhash64, 32 );
    keccak256_4way_close( &ctx.keccak, vhash64 );
 
    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -261,7 +233,7 @@ void allium_4way_hash( void *state, const void *input )
 
    intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
 
-   skein256_4way( &ctx.skein, vhash64, 32 );
+   skein256_4way_update( &ctx.skein, vhash64, 32 );
    skein256_4way_close( &ctx.skein, vhash64 );
 
    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c
index a76e68c..b86f514 100644
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -20,7 +20,7 @@ static __thread blake256_4way_context l2h_4way_blake_mid;
 void lyra2h_4way_midstate( const void* input )
 {
        blake256_4way_init( &l2h_4way_blake_mid );
-       blake256_4way( &l2h_4way_blake_mid, input, 64 );
+       blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
 }
 
 void lyra2h_4way_hash( void *state, const void *input )
diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c
index f2954c3..0ed53c5 100644
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -44,7 +44,7 @@ void lyra2rev2_8way_hash( void *state, const void *input )
    lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
    memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
 
-   blake256_8way( &ctx.blake, input + (64<<3), 16 );
+   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
    blake256_8way_close( &ctx.blake, vhash );
 
    rintrlv_8x32_8x64( vhashA, vhash, 256 );
@@ -176,12 +176,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
    lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
    memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
 
-   blake256_4way( &ctx.blake, input + (64<<2), 16 );
+   blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
    blake256_4way_close( &ctx.blake, vhash );
 
    rintrlv_4x32_4x64( vhash64, vhash, 256 );
 
-   keccak256_4way( &ctx.keccak, vhash64, 32 );
+   keccak256_4way_update( &ctx.keccak, vhash64, 32 );
    keccak256_4way_close( &ctx.keccak, vhash64 );
 
    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -201,7 +201,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
 
    intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
 
-   skein256_4way( &ctx.skein, vhash64, 32 );
+   skein256_4way_update( &ctx.skein, vhash64, 32 );
    skein256_4way_close( &ctx.skein, vhash64 );
 
    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -217,7 +217,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
 
    intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
 
-   bmw256_4way( &ctx.bmw, vhash, 32 );
+   bmw256_4way_update( &ctx.bmw, vhash, 32 );
    bmw256_4way_close( &ctx.bmw, state );
 }
 
@@ -242,7 +242,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
    mm128_bswap32_intrlv80_4x32( vdata, pdata );
 
    blake256_4way_init( &l2v2_4way_ctx.blake );
-   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
+   blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
 
    do
    {
diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c
index 6e560be..a7a9a3c 100644
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -209,7 +209,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
    lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
    memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
 
-   blake256_8way( &ctx.blake, input + (64*8), 16 );
+   blake256_8way_update( &ctx.blake, input + (64*8), 16 );
    blake256_8way_close( &ctx.blake, vhash );
 
    dintrlv_8x32( hash0, hash1, hash2, hash3,
@@ -252,7 +252,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
    intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
                              hash4, hash5, hash6, hash7, 256 );
 
-   bmw256_8way( &ctx.bmw, vhash, 32 );
+   bmw256_8way_update( &ctx.bmw, vhash, 32 );
    bmw256_8way_close( &ctx.bmw, state );
 
    }
@@ -277,7 +277,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
    mm256_bswap32_intrlv80_8x32( vdata, pdata );
 
    blake256_8way_init( &l2v3_8way_ctx.blake );
-   blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 );
+   blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
 
    do
    {
@@ -334,8 +334,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
    lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
    memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
 
-//   blake256_4way( &ctx.blake, input, 80 );
-   blake256_4way( &ctx.blake, input + (64*4), 16 );
+   blake256_4way_update( &ctx.blake, input + (64*4), 16 );
    blake256_4way_close( &ctx.blake, vhash );
    dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
 
@@ -358,7 +357,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
    LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
 
    intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
-   bmw256_4way( &ctx.bmw, vhash, 32 );
+   bmw256_4way_update( &ctx.bmw, vhash, 32 );
    bmw256_4way_close( &ctx.bmw, state );
 }
 
@@ -383,7 +382,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
    mm128_bswap32_intrlv80_4x32( vdata, pdata );
 
    blake256_4way_init( &l2v3_4way_ctx.blake );
-   blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 );
+   blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
 
    do
    {
diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c
index 3f5e56e..7273ebe 100644
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -149,7 +149,7 @@ static __thread blake256_8way_context l2z_8way_blake_mid;
 void lyra2z_8way_midstate( const void* input )
 {
        blake256_8way_init( &l2z_8way_blake_mid );
-       blake256_8way( &l2z_8way_blake_mid, input, 64 );
+       blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
 }
 
 void lyra2z_8way_hash( void *state, const void *input )
@@ -166,7 +166,7 @@ void lyra2z_8way_hash( void *state, const void *input )
      blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
 
      memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
-     blake256_8way( &ctx_blake, input + (64*8), 16 );
+     blake256_8way_update( &ctx_blake, input + (64*8), 16 );
      blake256_8way_close( &ctx_blake, vhash );
 
      dintrlv_8x32( hash0, hash1, hash2, hash3,
@@ -247,7 +247,7 @@ static __thread blake256_4way_context l2z_4way_blake_mid;
 void lyra2z_4way_midstate( const void* input )
 {
        blake256_4way_init( &l2z_4way_blake_mid );
-       blake256_4way( &l2z_4way_blake_mid, input, 64 );
+       blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
 }
 
 void lyra2z_4way_hash( void *state, const void *input )
@@ -260,7 +260,7 @@ void lyra2z_4way_hash( void *state, const void *input )
      blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
 
      memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
-     blake256_4way( &ctx_blake, input + (64*4), 16 );
+     blake256_4way_update( &ctx_blake, input + (64*4), 16 );
      blake256_4way_close( &ctx_blake, vhash );
 
      dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
diff --git a/algo/nist5/nist5-4way.c b/algo/nist5/nist5-4way.c
index c4aa73d..9b8687b 100644
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -133,7 +133,7 @@ void nist5hash_4way( void *out, const void *input )
      keccak512_4way_context ctx_keccak;
 
      blake512_4way_init( &ctx_blake );
-     blake512_4way( &ctx_blake, input, 80 );
+     blake512_4way_update( &ctx_blake, input, 80 );
      blake512_4way_close( &ctx_blake, vhash );
 
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -154,15 +154,15 @@ void nist5hash_4way( void *out, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
      jh512_4way_init( &ctx_jh );
-     jh512_4way( &ctx_jh, vhash, 64 );
+     jh512_4way_update( &ctx_jh, vhash, 64 );
      jh512_4way_close( &ctx_jh, vhash );
 
      keccak512_4way_init( &ctx_keccak );
-     keccak512_4way( &ctx_keccak, vhash, 64 );
+     keccak512_4way_update( &ctx_keccak, vhash, 64 );
      keccak512_4way_close( &ctx_keccak, vhash );
 
      skein512_4way_init( &ctx_skein );
-     skein512_4way( &ctx_skein, vhash, 64 );
+     skein512_4way_update( &ctx_skein, vhash, 64 );
      skein512_4way_close( &ctx_skein, out );
 }
 
diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c
index 2c5d561..be1c19f 100644
--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -54,10 +54,10 @@ void anime_4way_hash( void *state, const void *input )
     anime_4way_ctx_holder ctx;
     memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
 
-    bmw512_4way( &ctx.bmw, input, 80 );
+    bmw512_4way_update( &ctx.bmw, input, 80 );
     bmw512_4way_close( &ctx.bmw, vhash );
 
-    blake512_4way( &ctx.blake, vhash, 64 );
+    blake512_4way_update( &ctx.blake, vhash, 64 );
     blake512_4way_close( &ctx.blake, vhash );
 
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -92,7 +92,7 @@ void anime_4way_hash( void *state, const void *input )
 
     if ( mm256_anybits0( vh_mask ) )
     {
-       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_update( &ctx.skein, vhash, 64 );
        skein512_4way_close( &ctx.skein, vhashB );
     }
 
@@ -111,7 +111,7 @@ void anime_4way_hash( void *state, const void *input )
 
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
-    jh512_4way( &ctx.jh, vhash, 64 );
+    jh512_4way_update( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );
 
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -119,23 +119,23 @@ void anime_4way_hash( void *state, const void *input )
     if ( mm256_anybits1( vh_mask ) )
     {
        blake512_4way_init( &ctx.blake );
-       blake512_4way( &ctx.blake, vhash, 64 );
+       blake512_4way_update( &ctx.blake, vhash, 64 );
        blake512_4way_close( &ctx.blake, vhashA );
     }
     if ( mm256_anybits0( vh_mask ) )
     {
        bmw512_4way_init( &ctx.bmw );
-       bmw512_4way( &ctx.bmw, vhash, 64 );
+       bmw512_4way_update( &ctx.bmw, vhash, 64 );
        bmw512_4way_close( &ctx.bmw, vhashB );
     }
 
     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
 
-    keccak512_4way( &ctx.keccak, vhash, 64 );
+    keccak512_4way_update( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
 
     skein512_4way_init( &ctx.skein );
-    skein512_4way( &ctx.skein, vhash, 64 );
+    skein512_4way_update( &ctx.skein, vhash, 64 );
     skein512_4way_close( &ctx.skein, vhash );
 
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -143,13 +143,13 @@ void anime_4way_hash( void *state, const void *input )
     if ( mm256_anybits1( vh_mask ) )
     {
        keccak512_4way_init( &ctx.keccak );
-       keccak512_4way( &ctx.keccak, vhash, 64 );
+       keccak512_4way_update( &ctx.keccak, vhash, 64 );
        keccak512_4way_close( &ctx.keccak, vhashA );
     }
     if ( mm256_anybits0( vh_mask ) )
     {
        jh512_4way_init( &ctx.jh );
-       jh512_4way( &ctx.jh, vhash, 64 );
+       jh512_4way_update( &ctx.jh, vhash, 64 );
        jh512_4way_close( &ctx.jh, vhashB );
     }
 
diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c
index 9f22d29..c4e9033 100644
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -830,7 +830,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
      __m256i* vhB = (__m256i*)vhashB;
 
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, input, 80 );
+     bmw512_4way_update( &ctx.bmw, input, 80 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -889,18 +889,18 @@ extern void hmq1725_4way_hash(void *state, const void *input)
      if ( mm256_anybits1( vh_mask ) )
      {
        skein512_4way_init( &ctx.skein );
-       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_update( &ctx.skein, vhash, 64 );
        skein512_4way_close( &ctx.skein, vhashB );
      }
 
      mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
 
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
 // second fork, A = blake parallel, B= bmw parallel.
@@ -911,14 +911,14 @@ extern void hmq1725_4way_hash(void *state, const void *input)
      if ( mm256_anybits0( vh_mask ) )
      {
        blake512_4way_init( &ctx.blake );
-       blake512_4way( &ctx.blake, vhash, 64 );
+       blake512_4way_update( &ctx.blake, vhash, 64 );
        blake512_4way_close( &ctx.blake, vhashA );
      }
 
      if ( mm256_anybits1( vh_mask ) )
      {
        bmw512_4way_init( &ctx.bmw );
-       bmw512_4way( &ctx.bmw, vhash, 64 );
+       bmw512_4way_update( &ctx.bmw, vhash, 64 );
        bmw512_4way_close( &ctx.bmw, vhashB );
      }
 
@@ -962,14 +962,14 @@ extern void hmq1725_4way_hash(void *state, const void *input)
      if ( mm256_anybits0( vh_mask ) )
      {
         keccak512_4way_init( &ctx.keccak );
-        keccak512_4way( &ctx.keccak, vhash, 64 );
+        keccak512_4way_update( &ctx.keccak, vhash, 64 );
         keccak512_4way_close( &ctx.keccak, vhashA );
      }
 
      if ( mm256_anybits1( vh_mask ) )
      {
         jh512_4way_init( &ctx.jh );
-        jh512_4way( &ctx.jh, vhash, 64 );
+        jh512_4way_update( &ctx.jh, vhash, 64 );
         jh512_4way_close( &ctx.jh, vhashB );
      }
 
@@ -990,7 +990,6 @@ extern void hmq1725_4way_hash(void *state, const void *input)
      sph_shavite512 ( &ctx.shavite, hash3, 64 );
      sph_shavite512_close( &ctx.shavite, hash3 );
 
-
      intrlv_2x128_512( vhashA, hash0, hash1 );
      intrlv_2x128_512( vhashB, hash2, hash3 );
 
@@ -1042,7 +1041,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
      if ( mm256_anybits1( vh_mask ) )
      {
         haval256_5_4way_init( &ctx.haval );
-        haval256_5_4way( &ctx.haval, vhash, 64 );
+        haval256_5_4way_update( &ctx.haval, vhash, 64 );
         haval256_5_4way_close( &ctx.haval, vhash );
         memset( &vhash[8<<2], 0, 32<<2 );
         rintrlv_4x32_4x64( vhashB, vhash, 512 );
@@ -1068,7 +1067,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
      
      blake512_4way_init( &ctx.blake );
-     blake512_4way( &ctx.blake, vhash, 64 );
+     blake512_4way_update( &ctx.blake, vhash, 64 );
      blake512_4way_close( &ctx.blake, vhash );
 
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -1130,7 +1129,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
    hamsi512_4way_init( &ctx.hamsi );
-   hamsi512_4way( &ctx.hamsi, vhash, 64 );
+   hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
    hamsi512_4way_close( &ctx.hamsi, vhash );
 
    dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -1214,7 +1213,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
 
    shabal512_4way_init( &ctx.shabal );
-   shabal512_4way( &ctx.shabal, vhash, 64 );
+   shabal512_4way_update( &ctx.shabal, vhash, 64 );
    shabal512_4way_close( &ctx.shabal, vhash );
 
    dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -1269,7 +1268,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    if ( mm256_anybits1( vh_mask ) )
    {
       sha512_4way_init( &ctx.sha512 );
-      sha512_4way( &ctx.sha512, vhash, 64 );
+      sha512_4way_update( &ctx.sha512, vhash, 64 );
       sha512_4way_close( &ctx.sha512, vhashB );
    }
 
@@ -1289,7 +1288,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
    sha512_4way_init( &ctx.sha512 ); 
-   sha512_4way( &ctx.sha512, vhash, 64 );
+   sha512_4way_update( &ctx.sha512, vhash, 64 );
    sha512_4way_close( &ctx.sha512, vhash ); 
 
 // A = haval parallel, B = Whirlpool serial
@@ -1305,7 +1304,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    if ( mm256_anybits0( vh_mask ) )
    {
       haval256_5_4way_init( &ctx.haval );
-      haval256_5_4way( &ctx.haval, vhash, 64 );
+      haval256_5_4way_update( &ctx.haval, vhash, 64 );
       haval256_5_4way_close( &ctx.haval, vhash );
       memset( &vhash[8<<2], 0, 32<<2 );
       rintrlv_4x32_4x64( vhashA, vhash, 512 );
@@ -1341,7 +1340,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
    mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
 
    bmw512_4way_init( &ctx.bmw );
-   bmw512_4way( &ctx.bmw, vhash, 64 );
+   bmw512_4way_update( &ctx.bmw, vhash, 64 );
    bmw512_4way_close( &ctx.bmw, vhash );
 
  	memcpy(state, vhash, 32<<2 );
diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c
index 180d636..aa3fc0f 100644
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -289,10 +289,10 @@ void quark_4way_hash( void *state, const void *input )
 
     memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
 
-    blake512_4way( &ctx.blake, input, 80 );
+    blake512_4way_update( &ctx.blake, input, 80 );
     blake512_4way_close( &ctx.blake, vhash );
 
-    bmw512_4way( &ctx.bmw, vhash, 64 );
+    bmw512_4way_update( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
 
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -327,7 +327,7 @@ void quark_4way_hash( void *state, const void *input )
 
     if ( mm256_anybits1( vh_mask ) )   
     {
-       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_update( &ctx.skein, vhash, 64 );
        skein512_4way_close( &ctx.skein, vhashB );
     }
 
@@ -346,7 +346,7 @@ void quark_4way_hash( void *state, const void *input )
 
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
-    jh512_4way( &ctx.jh, vhash, 64 );
+    jh512_4way_update( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );
 
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -354,24 +354,24 @@ void quark_4way_hash( void *state, const void *input )
     if ( mm256_anybits0( vh_mask ) )   
     {
        blake512_4way_init( &ctx.blake );
-       blake512_4way( &ctx.blake, vhash, 64 );
+       blake512_4way_update( &ctx.blake, vhash, 64 );
        blake512_4way_close( &ctx.blake, vhashA );
     }
 
     if ( mm256_anybits1( vh_mask ) )
     {
        bmw512_4way_init( &ctx.bmw );
-       bmw512_4way( &ctx.bmw, vhash, 64 );
+       bmw512_4way_update( &ctx.bmw, vhash, 64 );
        bmw512_4way_close( &ctx.bmw, vhashB );
     }
 
     mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
 
-    keccak512_4way( &ctx.keccak, vhash, 64 );
+    keccak512_4way_update( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
 
     skein512_4way_init( &ctx.skein );
-    skein512_4way( &ctx.skein, vhash, 64 );
+    skein512_4way_update( &ctx.skein, vhash, 64 );
     skein512_4way_close( &ctx.skein, vhash );
 
     vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -379,14 +379,14 @@ void quark_4way_hash( void *state, const void *input )
     if ( mm256_anybits0( vh_mask ) )    
     {
        keccak512_4way_init( &ctx.keccak );
-       keccak512_4way( &ctx.keccak, vhash, 64 );
+       keccak512_4way_update( &ctx.keccak, vhash, 64 );
        keccak512_4way_close( &ctx.keccak, vhashA );
     }
 
     if ( mm256_anybits1( vh_mask ) )
     {
        jh512_4way_init( &ctx.jh );
-       jh512_4way( &ctx.jh, vhash, 64 );
+       jh512_4way_update( &ctx.jh, vhash, 64 );
        jh512_4way_close( &ctx.jh, vhashB );
     }
 
diff --git a/algo/ripemd/lbry-4way.c b/algo/ripemd/lbry-4way.c
index 78a6f5a..0228c86 100644
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -7,7 +7,7 @@
 #include "ripemd-hash-4way.h"
 
 #define LBRY_INPUT_SIZE 112
-#define LBRY_MIDSTATE    64
+#define LBRY_MIDSTATE    96
 #define LBRY_TAIL (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE)
 
 #if defined(LBRY_16WAY)
@@ -35,9 +35,9 @@ void lbry_16way_hash( void* output, const void* input )
    uint32_t _ALIGN(64) h13[32];
    uint32_t _ALIGN(64) h14[32];
    uint32_t _ALIGN(64) h15[32];
-   sha256_16way_context     ctx_sha256 __attribute__ ((aligned (64)));
+   sha256_16way_context    ctx_sha256 __attribute__ ((aligned (64)));
    sha512_8way_context     ctx_sha512;
-   ripemd160_16way_context  ctx_ripemd;
+   ripemd160_16way_context ctx_ripemd;
 
    memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
    sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
@@ -62,7 +62,7 @@ void lbry_16way_hash( void* output, const void* input )
    sha512_8way_close( &ctx_sha512, vhashB );
 
    // back to 8-way 32 bit
-   dintrlv_8x64( h0, h1, h2, h3,h4, h5, h6, h7, vhashA, 512 );
+   dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 );
    dintrlv_8x64( h8, h9, h10, h11, h12, h13, h14, h15, vhashB, 512 );
    intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
                          h8, h9, h10, h11, h12, h13, h14, h15, 512 );
@@ -90,14 +90,15 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
 {
    uint32_t hash[8*16] __attribute__ ((aligned (128)));
    uint32_t vdata[32*16] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t edata[32] __attribute__ ((aligned (64)));
    uint32_t *hash7 = &(hash[7<<4]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[27];
    const uint32_t first_nonce = pdata[27];
+   const uint32_t last_nonce = max_nonce - 16;
    const uint32_t Htarg = ptarget[7];
-   uint32_t edata[32] __attribute__ ((aligned (64)));
    __m512i  *noncev = (__m512i*)vdata + 27;   // aligned
    int thr_id = mythr->id;  // thr_id arg is deprecated
 
@@ -114,14 +115,13 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
         edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
 
    sha256_16way_init( &sha256_16w_mid );
-   sha256_16way( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
+   sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
 
    do
    {
-      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
-                                                  n+11, n+10, n+ 9, n+ 8,
-                                                  n+ 7, n+ 6, n+ 5, n+ 4,
-                                                  n+ 3, n+ 2, n+ 1, n ) );
+      *noncev = mm512_bswap_32( _mm512_set_epi32(
+                         n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+                         n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
       lbry_16way_hash( hash, vdata );
 
       for ( int i = 0; i < 16; i++ )
@@ -129,27 +129,25 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
       {
          // deinterleave hash for lane
          extr_lane_16x32( lane_hash, hash, i, 256 );
-         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
          {
             pdata[27] = n + i;
             submit_lane_solution( work, lane_hash, mythr, i );
          }
       }
       n += 16;
-   } while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
-   *hashes_done = n - first_nonce + 1;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+   *hashes_done = n - first_nonce;
    return 0;
 }
 
-
-
 #elif defined(LBRY_8WAY)
 
 static __thread sha256_8way_context sha256_8w_mid;
 
 void lbry_8way_hash( void* output, const void* input )
 {
-   uint32_t _ALIGN(64) vhashA[16<<3];
+   uint32_t _ALIGN(128) vhashA[16<<3];
    uint32_t _ALIGN(64) vhashB[16<<3];
    uint32_t _ALIGN(64) vhashC[16<<3];
    uint32_t _ALIGN(32) h0[32];
@@ -165,11 +163,11 @@ void lbry_8way_hash( void* output, const void* input )
    ripemd160_8way_context  ctx_ripemd;
 
    memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );
-   sha256_8way( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
+   sha256_8way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
    sha256_8way_close( &ctx_sha256, vhashA );
 
    sha256_8way_init( &ctx_sha256 );
-   sha256_8way( &ctx_sha256, vhashA, 32 );
+   sha256_8way_update( &ctx_sha256, vhashA, 32 );
    sha256_8way_close( &ctx_sha256, vhashA );
 
    // reinterleave to do sha512 4-way 64 bit twice.
@@ -178,11 +176,11 @@ void lbry_8way_hash( void* output, const void* input )
    intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
 
    sha512_4way_init( &ctx_sha512 );
-   sha512_4way( &ctx_sha512, vhashA, 32 );
+   sha512_4way_update( &ctx_sha512, vhashA, 32 );
    sha512_4way_close( &ctx_sha512, vhashA );
 
    sha512_4way_init( &ctx_sha512 );
-   sha512_4way( &ctx_sha512, vhashB, 32 );
+   sha512_4way_update( &ctx_sha512, vhashB, 32 );
    sha512_4way_close( &ctx_sha512, vhashB );
 
    // back to 8-way 32 bit
@@ -191,20 +189,20 @@ void lbry_8way_hash( void* output, const void* input )
    intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
 
    ripemd160_8way_init( &ctx_ripemd );
-   ripemd160_8way( &ctx_ripemd, vhashA, 32 );
+   ripemd160_8way_update( &ctx_ripemd, vhashA, 32 );
    ripemd160_8way_close( &ctx_ripemd, vhashB );
 
    ripemd160_8way_init( &ctx_ripemd );
-   ripemd160_8way( &ctx_ripemd, vhashA+(8<<3), 32 );
+   ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 );
    ripemd160_8way_close( &ctx_ripemd, vhashC );
 
    sha256_8way_init( &ctx_sha256 );
-   sha256_8way( &ctx_sha256, vhashB, 20 );
-   sha256_8way( &ctx_sha256, vhashC, 20 );
+   sha256_8way_update( &ctx_sha256, vhashB, 20 );
+   sha256_8way_update( &ctx_sha256, vhashC, 20 );
    sha256_8way_close( &ctx_sha256, vhashA );
 
    sha256_8way_init( &ctx_sha256 );
-   sha256_8way( &ctx_sha256, vhashA, 32 );
+   sha256_8way_update( &ctx_sha256, vhashA, 32 );
    sha256_8way_close( &ctx_sha256, output );
 }
 
@@ -214,13 +212,13 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
    uint32_t hash[8*8] __attribute__ ((aligned (64)));
    uint32_t vdata[32*8] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t edata[32] __attribute__ ((aligned (64)));
    uint32_t *hash7 = &(hash[7<<3]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    uint32_t n = pdata[27];
    const uint32_t first_nonce = pdata[27];
    const uint32_t Htarg = ptarget[7];
-   uint32_t edata[32] __attribute__ ((aligned (64)));
    __m256i  *noncev = (__m256i*)vdata + 27;   // aligned
    int thr_id = mythr->id;  // thr_id arg is deprecated
 
@@ -237,7 +235,7 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
                        edata, edata, edata, edata, 1024 );
 
    sha256_8way_init( &sha256_8w_mid );
-   sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
+   sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
 
    do
    {
diff --git a/algo/ripemd/lbry-gate.c b/algo/ripemd/lbry-gate.c
index ac94c64..f4080a8 100644
--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -98,7 +98,7 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
 
 bool register_lbry_algo( algo_gate_t* gate )
 {
-  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
+//  gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
 #if defined (LBRY_16WAY)
   gate->scanhash              = (void*)&scanhash_lbry_16way;
   gate->hash                  = (void*)&lbry_16way_hash;
diff --git a/algo/ripemd/lbry-gate.h b/algo/ripemd/lbry-gate.h
index 603b5b5..2aedd6b 100644
--- a/algo/ripemd/lbry-gate.h
+++ b/algo/ripemd/lbry-gate.h
@@ -5,11 +5,10 @@
 #include <stdint.h>
 
 
-// 16 way needs sha256 16 way
-//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-//  #define LBRY_16WAY
-#if defined(__AVX2__)
-  #define LBRY_8WAY
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define LBRY_16WAY 1
+#elif defined(__AVX2__)
+  #define LBRY_8WAY 1
 #endif
 /*
 #if !defined(__SHA__)
@@ -37,13 +36,13 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
 void lbry_8way_hash( void *state, const void *input );
 int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
-/*
+
 #elif defined(LBRY_4WAY)
 
 void lbry_4way_hash( void *state, const void *input );
 int scanhash_lbry_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done );
-*/
+
 #else
 
 void lbry_hash( void *state, const void *input );
diff --git a/algo/ripemd/ripemd-hash-4way.c b/algo/ripemd/ripemd-hash-4way.c
index 42c0d2d..38de159 100644
--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -259,7 +259,8 @@ void ripemd160_4way_init( ripemd160_4way_context *sc )
    sc->count_high = sc->count_low = 0;
 }
 
-void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len )
+void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
+                            size_t len )
 {
    __m128i *vdata = (__m128i*)data;
    size_t ptr;
@@ -559,7 +560,8 @@ void ripemd160_8way_init( ripemd160_8way_context *sc )
    sc->count_high = sc->count_low = 0;
 }
 
-void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len )
+void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
+                            size_t len )
 {
    __m256i *vdata = (__m256i*)data;
    size_t ptr;
@@ -859,7 +861,7 @@ void ripemd160_16way_init( ripemd160_16way_context *sc )
    sc->count_high = sc->count_low = 0;
 }
 
-void ripemd160_16way( ripemd160_16way_context *sc, const void *data,
+void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
                       size_t len )
 {
    __m512i *vdata = (__m512i*)data;
diff --git a/algo/ripemd/ripemd-hash-4way.h b/algo/ripemd/ripemd-hash-4way.h
index c565ad7..71fb3d7 100644
--- a/algo/ripemd/ripemd-hash-4way.h
+++ b/algo/ripemd/ripemd-hash-4way.h
@@ -16,7 +16,8 @@ typedef struct
 } __attribute__ ((aligned (64))) ripemd160_4way_context;
 
 void ripemd160_4way_init( ripemd160_4way_context *sc );
-void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len );
+void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
+                            size_t len );
 void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );
 
 #if defined (__AVX2__)
@@ -26,10 +27,11 @@ typedef struct
    __m256i buf[64>>2];
    __m256i val[5];
    uint32_t count_high, count_low;
-} __attribute__ ((aligned (64))) ripemd160_8way_context;
+} __attribute__ ((aligned (128))) ripemd160_8way_context;
 
 void ripemd160_8way_init( ripemd160_8way_context *sc );
-void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len );
+void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
+                            size_t len );
 void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
 
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -42,7 +44,7 @@ typedef struct
 } __attribute__ ((aligned (128))) ripemd160_16way_context;
 
 void ripemd160_16way_init( ripemd160_16way_context *sc );
-void ripemd160_16way( ripemd160_16way_context *sc, const void *data,
+void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
                       size_t len );
 void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
 
diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h
index 2ac2a7e..3635dd9 100644
--- a/algo/sha/sha-hash-4way.h
+++ b/algo/sha/sha-hash-4way.h
@@ -41,13 +41,9 @@
 #define SHA2_HASH_4WAY_H__ 1
 
 #include <stddef.h>
-#include "sph_types.h"
 #include "simd-utils.h"
 
 #if defined(__SSE2__)
-//#if defined(__SSE4_2__)
-
-//#define SPH_SIZE_sha256   256
 
 // SHA-256 4 way
 
@@ -59,9 +55,12 @@ typedef struct {
 } sha256_4way_context __attribute__ ((aligned (64)));
 
 void sha256_4way_init( sha256_4way_context *sc );
-void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
+void sha256_4way_update( sha256_4way_context *sc, const void *data,
+                         size_t len );
 void sha256_4way_close( sha256_4way_context *sc, void *dst );
 
+#endif  // SSE2
+
 #if defined (__AVX2__)
 
 // SHA-256 8 way
@@ -75,10 +74,28 @@ typedef struct {
 
 void sha256_8way_init( sha256_8way_context *sc );
 void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
-#define sha256_8way sha256_8way_update
 void sha256_8way_close( sha256_8way_context *sc, void *dst );
 
-//#define SPH_SIZE_sha512   512
+#endif  // AVX2
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// SHA-256 16 way
+
+typedef struct {
+   __m512i buf[64>>2];
+   __m512i val[8];
+   uint32_t count_high, count_low;
+   bool initialized;
+} sha256_16way_context __attribute__ ((aligned (128)));
+
+void sha256_16way_init( sha256_16way_context *sc );
+void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
+void sha256_16way_close( sha256_16way_context *sc, void *dst );
+
+#endif // AVX512
+
+#if defined (__AVX2__)
 
 // SHA-512 4 way
 
@@ -92,9 +109,10 @@ typedef struct {
 void sha512_4way_init( sha512_4way_context *sc);
 void sha512_4way_update( sha512_4way_context *sc, const void *data,
                          size_t len );
-#define sha512_4way sha512_4way_update
 void sha512_4way_close( sha512_4way_context *sc, void *dst );
 
+#endif  // AVX2
+
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 
 // SHA-512 8 way
@@ -111,8 +129,6 @@ void sha512_8way_update( sha512_8way_context *sc, const void *data,
                          size_t len );
 void sha512_8way_close( sha512_8way_context *sc, void *dst );
 
-
 #endif  // AVX512
-#endif  // __AVX2__
-#endif  // __SSE2__
+
 #endif  // SHA256_4WAY_H__
diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c
index ba6b952..2167407 100644
--- a/algo/sha/sha256-hash-4way.c
+++ b/algo/sha/sha256-hash-4way.c
@@ -39,47 +39,31 @@
 // SHA-256 32 bit
 
 /*
-static const sph_u32 H256[8] = {
-        SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
-        SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
-        SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
-        SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+static const uint32_t H256[8] =
+{
+   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
 };
 */
 
-static const sph_u32 K256[64] = {
-        SPH_C32(0x428A2F98), SPH_C32(0x71374491),
-        SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
-        SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
-        SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
-        SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
-        SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
-        SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
-        SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
-        SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
-        SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
-        SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
-        SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
-        SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
-        SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
-        SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
-        SPH_C32(0x06CA6351), SPH_C32(0x14292967),
-        SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
-        SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
-        SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
-        SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
-        SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
-        SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
-        SPH_C32(0xD192E819), SPH_C32(0xD6990624),
-        SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
-        SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
-        SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
-        SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
-        SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
-        SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
-        SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
-        SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
-        SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
+static const uint32_t K256[64] =
+{
+   0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+   0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+   0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
+   0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+   0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
+   0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+   0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
+   0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+   0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
+   0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+   0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
+   0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+   0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
+   0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+   0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
+   0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
 };
 
 // SHA-256 4 way
@@ -248,7 +232,7 @@ void sha256_4way_init( sha256_4way_context *sc )
 */
 }
 
-void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
+void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
 {
    __m128i *vdata = (__m128i*)data;
    size_t ptr;
@@ -273,7 +257,7 @@ void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
          ptr = 0;
       }
       clow = sc->count_low;
-      clow2 = SPH_T32( clow + clen );
+      clow2 = clow + clen;
       sc->count_low = clow2;
       if ( clow2 < clow )
          sc->count_high++;
@@ -306,10 +290,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
 
     sc->buf[ pad >> 2 ] =
                  mm128_bswap_32( m128_const1_32( high ) );
-//                 mm128_bswap_32( _mm_set1_epi32( high ) );
     sc->buf[ ( pad+4 ) >> 2 ] =
                  mm128_bswap_32( m128_const1_32( low ) );
-//                 mm128_bswap_32( _mm_set1_epi32( low ) );
     sha256_4way_round( sc, sc->buf, sc->val );
 
     mm128_block_bswap_32( dst, sc->val );
@@ -483,7 +465,7 @@ void sha256_8way_init( sha256_8way_context *sc )
 */
 }
 
-void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
+void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
 {
    __m256i *vdata = (__m256i*)data;
    size_t ptr;
@@ -508,7 +490,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
          ptr = 0;
       }
       clow = sc->count_low;
-      clow2 = SPH_T32( clow + clen );
+      clow2 = clow + clen;
       sc->count_low = clow2;
       if ( clow2 < clow )
          sc->count_high++;
@@ -549,5 +531,233 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
     mm256_block_bswap_32( dst, sc->val );
 }
 
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+// SHA-256 16 way
+
+#define CHx16(X, Y, Z) \
+   _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) 
+
+#define MAJx16(X, Y, Z) \
+   _mm512_or_si512( _mm512_and_si512( X, Y ), \
+                    _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
+
+#define BSG2_0x16(x) \
+   _mm512_xor_si512( _mm512_xor_si512( \
+       mm512_ror_32(x,  2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) )
+
+#define BSG2_1x16(x) \
+   _mm512_xor_si512( _mm512_xor_si512( \
+       mm512_ror_32(x,  6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) )
+
+#define SSG2_0x16(x) \
+   _mm512_xor_si512( _mm512_xor_si512( \
+       mm512_ror_32(x,  7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) ) 
+
+#define SSG2_1x16(x) \
+   _mm512_xor_si512( _mm512_xor_si512( \
+       mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) )
+
+#define SHA2x16_MEXP( a, b, c, d ) \
+     mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] );
+
+#define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
+do { \
+  __m512i T1, T2; \
+  __m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \
+  T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
+                                           K, W[i] ) ); \
+  T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
+  D  = _mm512_add_epi32( D,  T1 ); \
+  H  = _mm512_add_epi32( T1, T2 ); \
+} while (0)
+
+static void
+sha256_16way_round( sha256_16way_context *ctx,  __m512i *in, __m512i r[8] )
+{
+   register  __m512i A, B, C, D, E, F, G, H;
+   __m512i W[16];
+
+   mm512_block_bswap_32( W  , in   );
+   mm512_block_bswap_32( W+8, in+8 );
+
+   if ( ctx->initialized )
+   {
+      A = r[0];
+      B = r[1];
+      C = r[2];
+      D = r[3];
+      E = r[4];
+      F = r[5];
+      G = r[6];
+      H = r[7];
+   }
+   else
+   {
+      A = m512_const1_64( 0x6A09E6676A09E667 );
+      B = m512_const1_64( 0xBB67AE85BB67AE85 );
+      C = m512_const1_64( 0x3C6EF3723C6EF372 );
+      D = m512_const1_64( 0xA54FF53AA54FF53A );
+      E = m512_const1_64( 0x510E527F510E527F );
+      F = m512_const1_64( 0x9B05688C9B05688C );
+      G = m512_const1_64( 0x1F83D9AB1F83D9AB );
+      H = m512_const1_64( 0x5BE0CD195BE0CD19 );
+   }
+
+   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
+   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
+   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
+   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
+   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
+   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
+   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
+   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
+   SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
+   SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
+   SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
+   SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
+   SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
+   SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
+   SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
+   SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
+
+   for ( int j = 16; j < 64; j += 16 )
+   {
+      W[ 0] = SHA2x16_MEXP( 14,  9,  1,  0 );
+      W[ 1] = SHA2x16_MEXP( 15, 10,  2,  1 );
+      W[ 2] = SHA2x16_MEXP(  0, 11,  3,  2 );
+      W[ 3] = SHA2x16_MEXP(  1, 12,  4,  3 );
+      W[ 4] = SHA2x16_MEXP(  2, 13,  5,  4 );
+      W[ 5] = SHA2x16_MEXP(  3, 14,  6,  5 );
+      W[ 6] = SHA2x16_MEXP(  4, 15,  7,  6 );
+      W[ 7] = SHA2x16_MEXP(  5,  0,  8,  7 );
+      W[ 8] = SHA2x16_MEXP(  6,  1,  9,  8 );
+      W[ 9] = SHA2x16_MEXP(  7,  2, 10,  9 );
+      W[10] = SHA2x16_MEXP(  8,  3, 11, 10 );
+      W[11] = SHA2x16_MEXP(  9,  4, 12, 11 );
+      W[12] = SHA2x16_MEXP( 10,  5, 13, 12 );
+      W[13] = SHA2x16_MEXP( 11,  6, 14, 13 );
+      W[14] = SHA2x16_MEXP( 12,  7, 15, 14 );
+      W[15] = SHA2x16_MEXP( 13,  8,  0, 15 );
+
+      SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
+      SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
+      SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
+      SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
+      SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
+      SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
+      SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
+      SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
+      SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
+      SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
+      SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
+      SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
+      SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
+      SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
+      SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
+      SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
+   }
+
+   if ( ctx->initialized )
+   {
+      r[0] = _mm512_add_epi32( r[0], A );
+      r[1] = _mm512_add_epi32( r[1], B );
+      r[2] = _mm512_add_epi32( r[2], C );
+      r[3] = _mm512_add_epi32( r[3], D );
+      r[4] = _mm512_add_epi32( r[4], E );
+      r[5] = _mm512_add_epi32( r[5], F );
+      r[6] = _mm512_add_epi32( r[6], G );
+      r[7] = _mm512_add_epi32( r[7], H );
+   }
+   else
+   {
+      ctx->initialized = true;
+      r[0] = _mm512_add_epi32( A, m512_const1_64( 0x6A09E6676A09E667 ) );
+      r[1] = _mm512_add_epi32( B, m512_const1_64( 0xBB67AE85BB67AE85 ) );
+      r[2] = _mm512_add_epi32( C, m512_const1_64( 0x3C6EF3723C6EF372 ) );
+      r[3] = _mm512_add_epi32( D, m512_const1_64( 0xA54FF53AA54FF53A ) );
+      r[4] = _mm512_add_epi32( E, m512_const1_64( 0x510E527F510E527F ) );
+      r[5] = _mm512_add_epi32( F, m512_const1_64( 0x9B05688C9B05688C ) );
+      r[6] = _mm512_add_epi32( G, m512_const1_64( 0x1F83D9AB1F83D9AB ) );
+      r[7] = _mm512_add_epi32( H, m512_const1_64( 0x5BE0CD195BE0CD19 ) );
+   }
+}
+
+void sha256_16way_init( sha256_16way_context *sc )
+{
+   sc->initialized = false;
+   sc->count_high = sc->count_low = 0;
+}
+
+
+void sha256_16way_update( sha256_16way_context *sc, const void *data,
+                           size_t len )
+{
+   __m512i *vdata = (__m512i*)data;
+   size_t ptr;
+   const int buf_size = 64;
+
+   ptr = (unsigned)sc->count_low & (buf_size - 1U);
+   while ( len > 0 )
+   {
+      size_t clen;
+      uint32_t clow, clow2;
+
+      clen = buf_size - ptr;
+      if ( clen > len )
+         clen = len;
+      memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );
+      vdata = vdata + (clen>>2);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+         sha256_16way_round( sc, sc->buf, sc->val );
+         ptr = 0;
+      }
+      clow = sc->count_low;
+      clow2 = clow + clen;
+      sc->count_low = clow2;
+      if ( clow2 < clow )
+         sc->count_high++;
+   }
+}
+
+void sha256_16way_close( sha256_16way_context *sc, void *dst )
+{
+    unsigned ptr;
+    uint32_t low, high;
+    const int buf_size = 64;
+    const int pad = buf_size - 8;
+
+    ptr = (unsigned)sc->count_low & (buf_size - 1U);
+    sc->buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
+    ptr += 4;
+
+    if ( ptr > pad )
+    {
+         memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
+         sha256_16way_round( sc, sc->buf, sc->val );
+         memset_zero_512( sc->buf, pad >> 2 );
+    }
+    else
+         memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
+
+    low = sc->count_low;
+    high = (sc->count_high << 3) | (low >> 29);
+    low = low << 3;
+
+    sc->buf[ pad >> 2 ] =
+                 mm512_bswap_32( m512_const1_32( high ) );
+    sc->buf[ ( pad+4 ) >> 2 ] =
+                 mm512_bswap_32( m512_const1_32( low ) );
+
+    sha256_16way_round( sc, sc->buf, sc->val );
+
+    mm512_block_bswap_32( dst, sc->val );
+}
+
+#endif  // AVX512
 #endif  // __AVX2__
 #endif  // __SSE2__
diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c
index 41c3458..2cecfcc 100644
--- a/algo/sha/sha256q-4way.c
+++ b/algo/sha/sha256q-4way.c
@@ -15,19 +15,19 @@ void sha256q_8way_hash( void* output, const void* input )
    sha256_8way_context ctx;
    memcpy( &ctx, &sha256_ctx8, sizeof ctx );
 
-   sha256_8way( &ctx, input + (64<<3), 16 );
+   sha256_8way_update( &ctx, input + (64<<3), 16 );
    sha256_8way_close( &ctx, vhash );
 
    sha256_8way_init( &ctx );
-   sha256_8way( &ctx, vhash, 32 );
+   sha256_8way_update( &ctx, vhash, 32 );
    sha256_8way_close( &ctx, vhash );
 
    sha256_8way_init( &ctx );
-   sha256_8way( &ctx, vhash, 32 );
+   sha256_8way_update( &ctx, vhash, 32 );
    sha256_8way_close( &ctx, vhash );
 
    sha256_8way_init( &ctx );
-   sha256_8way( &ctx, vhash, 32 );
+   sha256_8way_update( &ctx, vhash, 32 );
    sha256_8way_close( &ctx, output );
 }
 
@@ -61,7 +61,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
    // Need big endian data
    mm256_bswap32_intrlv80_8x32( vdata, pdata );
    sha256_8way_init( &sha256_ctx8 );
-   sha256_8way( &sha256_ctx8, vdata, 64 );
+   sha256_8way_update( &sha256_ctx8, vdata, 64 );
 
    for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
    {
@@ -108,19 +108,19 @@ void sha256q_4way_hash( void* output, const void* input )
    sha256_4way_context ctx;
    memcpy( &ctx, &sha256_ctx4, sizeof ctx );
 
-   sha256_4way( &ctx, input + (64<<2), 16 );
+   sha256_4way_update( &ctx, input + (64<<2), 16 );
    sha256_4way_close( &ctx, vhash );
 
    sha256_4way_init( &ctx );
-   sha256_4way( &ctx, vhash, 32 );
+   sha256_4way_update( &ctx, vhash, 32 );
    sha256_4way_close( &ctx, vhash );
 
    sha256_4way_init( &ctx );
-   sha256_4way( &ctx, vhash, 32 );
+   sha256_4way_update( &ctx, vhash, 32 );
    sha256_4way_close( &ctx, vhash );
 
    sha256_4way_init( &ctx );
-   sha256_4way( &ctx, vhash, 32 );
+   sha256_4way_update( &ctx, vhash, 32 );
    sha256_4way_close( &ctx, output );
 }
 
@@ -154,7 +154,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
 
    mm128_bswap32_intrlv80_4x32( vdata, pdata );
    sha256_4way_init( &sha256_ctx4 );
-   sha256_4way( &sha256_ctx4, vdata, 64 );
+   sha256_4way_update( &sha256_ctx4, vdata, 64 );
 
    for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
    {
diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c
index 5c4dd68..b48633b 100644
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -15,15 +15,15 @@ void sha256t_8way_hash( void* output, const void* input )
    sha256_8way_context ctx;
    memcpy( &ctx, &sha256_ctx8, sizeof ctx );
 
-   sha256_8way( &ctx, input + (64<<3), 16 );
+   sha256_8way_update( &ctx, input + (64<<3), 16 );
    sha256_8way_close( &ctx, vhash );
 
    sha256_8way_init( &ctx );
-   sha256_8way( &ctx, vhash, 32 );
+   sha256_8way_update( &ctx, vhash, 32 );
    sha256_8way_close( &ctx, vhash );
 
    sha256_8way_init( &ctx );
-   sha256_8way( &ctx, vhash, 32 );
+   sha256_8way_update( &ctx, vhash, 32 );
    sha256_8way_close( &ctx, output );
 }
 
@@ -59,7 +59,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
    // Need big endian data
    mm256_bswap32_intrlv80_8x32( vdata, pdata );
    sha256_8way_init( &sha256_ctx8 );
-   sha256_8way( &sha256_ctx8, vdata, 64 );
+   sha256_8way_update( &sha256_ctx8, vdata, 64 );
 
    for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
    {
@@ -101,15 +101,15 @@ void sha256t_4way_hash( void* output, const void* input )
    sha256_4way_context ctx;
    memcpy( &ctx, &sha256_ctx4, sizeof ctx );
 
-   sha256_4way( &ctx, input + (64<<2), 16 );
+   sha256_4way_update( &ctx, input + (64<<2), 16 );
    sha256_4way_close( &ctx, vhash );
 
    sha256_4way_init( &ctx );
-   sha256_4way( &ctx, vhash, 32 );
+   sha256_4way_update( &ctx, vhash, 32 );
    sha256_4way_close( &ctx, vhash );
 
    sha256_4way_init( &ctx );
-   sha256_4way( &ctx, vhash, 32 );
+   sha256_4way_update( &ctx, vhash, 32 );
    sha256_4way_close( &ctx, output );
 }
 
@@ -143,7 +143,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
 
    mm128_bswap32_intrlv80_4x32( vdata, pdata );
    sha256_4way_init( &sha256_ctx4 );
-   sha256_4way( &sha256_ctx4, vdata, 64 );
+   sha256_4way_update( &sha256_ctx4, vdata, 64 );
 
    for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
    {
diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c
index 3ee8194..d056da0 100644
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -37,55 +37,57 @@
 #include "sha-hash-4way.h"
 
 /*
-static const sph_u64 H512[8] = {
-        SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
-        SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
-        SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
-        SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+static const uint64_t H512[8] =
+{
+   0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+   0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+   0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+   0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 };
 */
 
-static const sph_u64 K512[80] = {
-	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
-	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
-	SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
-	SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
-	SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
-	SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
-	SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
-	SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
-	SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
-	SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
-	SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
-	SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
-	SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
-	SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
-	SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
-	SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
-	SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
-	SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
-	SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
-	SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
-	SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
-	SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
-	SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
-	SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
-	SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
-	SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
-	SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
-	SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
-	SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
-	SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
-	SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
-	SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
-	SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
-	SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
-	SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
-	SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
-	SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
-	SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
-	SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
-	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
+static const uint64_t K512[80] =   // SHA-512 round constants, one per round
+{
+	0x428A2F98D728AE22, 0x7137449123EF65CD,
+	0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
+	0x3956C25BF348B538, 0x59F111F1B605D019,
+	0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
+	0xD807AA98A3030242, 0x12835B0145706FBE,
+	0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
+	0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
+	0x9BDC06A725C71235, 0xC19BF174CF692694,
+	0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
+	0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
+	0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
+	0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
+	0x983E5152EE66DFAB, 0xA831C66D2DB43210,
+	0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
+	0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
+	0x06CA6351E003826F, 0x142929670A0E6E70,
+	0x27B70A8546D22FFC, 0x2E1B21385C26C926,
+	0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
+	0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
+	0x81C2C92E47EDAEE6, 0x92722C851482353B,
+	0xA2BFE8A14CF10364, 0xA81A664BBC423001,
+	0xC24B8B70D0F89791, 0xC76C51A30654BE30,
+	0xD192E819D6EF5218, 0xD69906245565A910,
+	0xF40E35855771202A, 0x106AA07032BBD1B8,
+	0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
+	0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
+	0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
+	0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
+	0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
+	0x84C87814A1F0AB72, 0x8CC702081A6439EC,
+	0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
+	0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
+	0xCA273ECEEA26619C, 0xD186B8C721C0C207,
+	0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
+	0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
+	0x113F9804BEF90DAE, 0x1B710B35131C471B,
+	0x28DB77F523047D84, 0x32CAAB7B40C72493,
+	0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
+	0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
+	0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
 };
 
 
diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h
index c296f8c..0efec0b 100644
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -97,7 +97,7 @@ void shabal256_4way_addbits_and_close(	void *cc, unsigned ub, unsigned n,
 
 void shabal512_4way_init( void *cc );
 void shabal512_4way_update( void *cc, const void *data, size_t len );
-#define shabal512_4way shabal512_4way_update
+//#define shabal512_4way shabal512_4way_update
 void shabal512_4way_close( void *cc, void *dst );
 void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                        void *dst );
diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c
index a992789..c040e15 100644
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -18,76 +18,18 @@ void skeinhash_8way( void *state, const void *input )
      uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
      skein512_8way_context ctx_skein;
 
-//#if defined(__SHA__)
-//     uint32_t hash0[16] __attribute__ ((aligned (64)));
-//     uint32_t hash1[16] __attribute__ ((aligned (64)));
-//     uint32_t hash2[16] __attribute__ ((aligned (64)));
-//     uint32_t hash3[16] __attribute__ ((aligned (64)));
-//     uint32_t hash4[16] __attribute__ ((aligned (64)));
-//     uint32_t hash5[16] __attribute__ ((aligned (64)));
-//     uint32_t hash6[16] __attribute__ ((aligned (64)));
-//     uint32_t hash7[16] __attribute__ ((aligned (64)));
-//     SHA256_CTX           ctx_sha256;
-//#else
      uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
      sha256_8way_context ctx_sha256;
-//#endif
 
      skein512_8way_init( &ctx_skein );
      skein512_8way_update( &ctx_skein, input, 80 );
      skein512_8way_close( &ctx_skein, vhash64 );
-/*
-#if defined(__SHA__)      
-     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                   vhash64, 512 );
-
-     SHA256_Init( &ctx_sha256 );
-     SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
-     SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
-
-     SHA256_Init( &ctx_sha256 );
-     SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
-     SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
-
-     SHA256_Init( &ctx_sha256 );
-     SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
-     SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
-
-     SHA256_Init( &ctx_sha256 );
-     SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
-     SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
-
-     SHA256_Init( &ctx_sha256 );
-     SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 );
-     SHA256_Final( (unsigned char*)hash4, &ctx_sha256 );
-
-     SHA256_Init( &ctx_sha256 );
-     SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 );
-     SHA256_Final( (unsigned char*)hash5, &ctx_sha256 );
-
-     SHA256_Init( &ctx_sha256 );
-     SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 );
-     SHA256_Final( (unsigned char*)hash6, &ctx_sha256 );
-
-     SHA256_Init( &ctx_sha256 );
-     SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 );
-     SHA256_Final( (unsigned char*)hash7, &ctx_sha256 );
-     
-     intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                  hash7, 256 );
-#else
-*/
 
      rintrlv_8x64_8x32( vhash32, vhash64, 512 );
-//     dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-//                   vhash64, 512 );
-//     intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-//                   hash7, 512 );
 
      sha256_8way_init( &ctx_sha256 );
-     sha256_8way( &ctx_sha256, vhash32, 64 );
+     sha256_8way_update( &ctx_sha256, vhash32, 64 );
      sha256_8way_close( &ctx_sha256, state );
-//#endif
 }
 
 int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
@@ -176,7 +118,7 @@ void skeinhash_4way( void *state, const void *input )
      rintrlv_4x64_4x32( vhash32, vhash64, 512 );
 
      sha256_4way_init( &ctx_sha256 );
-     sha256_4way( &ctx_sha256, vhash32, 64 );
+     sha256_4way_update( &ctx_sha256, vhash32, 64 );
      sha256_4way_close( &ctx_sha256, state );
 #endif
 }
diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h
index 4f828a1..3f58e95 100644
--- a/algo/skein/skein-hash-4way.h
+++ b/algo/skein/skein-hash-4way.h
@@ -93,12 +93,12 @@ typedef sph_skein_4way_big_context skein256_4way_context;
 void skein512_4way_init( skein512_4way_context *sc );
 void skein512_4way_update( void *cc, const void *data, size_t len );
 void skein512_4way_close( void *cc, void *dst );
-#define skein512_4way skein512_4way_update
+//#define skein512_4way skein512_4way_update
 
 void skein256_4way_init( skein256_4way_context *sc );
 void skein256_4way_update( void *cc, const void *data, size_t len );
 void skein256_4way_close( void *cc, void *dst );
-#define skein256_4way skein256_4way_update
+//#define skein256_4way skein256_4way_update
 
 #ifdef __cplusplus
 }
diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c
index a51508b..b2a7962 100644
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -68,11 +68,11 @@ void skein2hash_4way( void *output, const void *input )
    uint64_t hash[16*4] __attribute__ ((aligned (64)));
 
    skein512_4way_init( &ctx );
-   skein512_4way( &ctx, input, 80 );
+   skein512_4way_update( &ctx, input, 80 );
    skein512_4way_close( &ctx, hash );
 
    skein512_4way_init( &ctx );
-   skein512_4way( &ctx, hash, 64 );
+   skein512_4way_update( &ctx, hash, 64 );
    skein512_4way_close( &ctx, output );
 }
 
diff --git a/algo/sm3/sm3-hash-4way.c b/algo/sm3/sm3-hash-4way.c
index f900aba..6e17d1b 100644
--- a/algo/sm3/sm3-hash-4way.c
+++ b/algo/sm3/sm3-hash-4way.c
@@ -50,41 +50,138 @@
 #include <string.h>
 #include "sm3-hash-4way.h"
 
-#ifdef __SSE4_2__
+#ifdef __AVX2__
 
-void sm3_4way_init( sm3_4way_ctx_t *ctx )
+#define P0_8W(x) /* x ^ rol(x,9) ^ rol(x,17) */ \
+   _mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x,  9 ), \
+                                          mm256_rol_32( x, 17 ) ) ) 
+// NOTE(review): mm256_rol_32 is defined elsewhere; assumed to rotate each 32-bit lane left.
+#define P1_8W(x) /* x ^ rol(x,15) ^ rol(x,23) */ \
+   _mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 15 ), \
+                                          mm256_rol_32( x, 23 ) ) ) 
+
+#define FF0_8W(x,y,z) /* three-way xor: x ^ y ^ z */ \
+   _mm256_xor_si256( x, _mm256_xor_si256( y, z ) )
+
+#define FF1_8W(x,y,z) /* bitwise majority: (x&y) | (x&z) | (y&z) */ \
+   _mm256_or_si256( _mm256_or_si256( _mm256_and_si256( x, y ), \
+                                     _mm256_and_si256( x, z ) ), \
+                                     _mm256_and_si256( y, z ) )
+
+#define GG0_8W(x,y,z)  FF0_8W(x,y,z)   // same three-way xor as FF0_8W
+
+#define GG1_8W(x,y,z) /* bitwise select: (x&y) | (~x&z) */ \
+   _mm256_or_si256( _mm256_and_si256( x, y ), \
+                    _mm256_andnot_si256( x, z ) )
+// SM3 compression: folds one 16-word block into digest[0..7], 8 lanes at once.
+void sm3_8way_compress( __m256i *digest, __m256i *block )
 {
-	ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
-	ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
-	ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
-	ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
-	ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
-	ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
-	ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
-	ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
-	ctx->nblocks = 0;
-	ctx->num = 0;
+   __m256i W[68], W1[64];
+   __m256i A = digest[ 0 ];
+   __m256i B = digest[ 1 ];
+   __m256i C = digest[ 2 ];
+   __m256i D = digest[ 3 ];
+   __m256i E = digest[ 4 ];
+   __m256i F = digest[ 5 ];
+   __m256i G = digest[ 6 ];
+   __m256i H = digest[ 7 ];
+   __m256i SS1, SS2, TT1, TT2, T;
+   int j;
+   // Load the 16 input words through mm256_bswap_32 (presumably a per-lane byte swap).
+   for ( j = 0; j < 16; j++ )
+      W[j] = mm256_bswap_32( block[j] );
+   // Expand to W[16..67]; each new word mixes five earlier ones.
+   for ( j = 16; j < 68; j++ )
+      W[j] = _mm256_xor_si256( P1_8W( _mm256_xor_si256(
+                                      _mm256_xor_si256( W[ j-16 ], W[ j-9 ] ),
+                                      mm256_rol_32( W[ j-3 ], 15 ) ) ),
+                  _mm256_xor_si256( mm256_rol_32( W[ j-13 ], 7 ), W[ j-6 ] ) );
+   // W1[j] = W[j] ^ W[j+4]; consumed by the TT1 sum in every round.
+   for( j = 0; j < 64; j++ )
+       W1[j] = _mm256_xor_si256( W[j], W[j+4] );
+   // Rounds 0-15: constant 0x79CC4519 with the xor forms FF0/GG0.
+   T = _mm256_set1_epi32( 0x79CC4519UL );
+   for( j =0; j < 16; j++ )
+   {
+      SS1 = mm256_rol_32( _mm256_add_epi32( E, _mm256_add_epi32(
+                      mm256_rol_32( A, 12 ), mm256_rol_var_32( T, j ) ) ), 7 );
+      SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) );
+      TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
+                                       FF0_8W( A, B, C ), D ), SS2 ), W1[j] );
+      TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
+                                       GG0_8W( E, F, G ), H ), SS1 ), W[j] );
+      D = C;
+      C = mm256_rol_32( B, 9 );
+      B = A;
+      A = TT1;
+      H = G;
+      G = mm256_rol_32( F, 19 );
+      F = E;
+      E = P0_8W( TT2 );
+   }
+   // Rounds 16-63: constant 0x7A879D8A with FF1/GG1; rotate count is j&31.
+   T = _mm256_set1_epi32( 0x7A879D8AUL );
+   for( j =16; j < 64; j++ )
+   {
+      SS1 = mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32(
+                  mm256_rol_32(A,12), E ), mm256_rol_var_32( T, j&31 ) ), 7 );
+      SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) );
+      TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
+                                       FF1_8W( A, B, C ), D ), SS2 ), W1[j] );
+      TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
+                                       GG1_8W( E, F, G ), H ), SS1 ), W[j] );
+      D = C;
+      C = mm256_rol_32( B, 9 );
+      B = A;
+      A = TT1;
+      H = G;
+      G = mm256_rol_32( F, 19 );
+      F = E;
+      E = P0_8W( TT2 );
+   }
+   // Fold the working variables back into the caller's digest with xor.
+   digest[0] = _mm256_xor_si256( digest[0], A );
+   digest[1] = _mm256_xor_si256( digest[1], B );
+   digest[2] = _mm256_xor_si256( digest[2], C );
+   digest[3] = _mm256_xor_si256( digest[3], D );
+   digest[4] = _mm256_xor_si256( digest[4], E );
+   digest[5] = _mm256_xor_si256( digest[5], F );
+   digest[6] = _mm256_xor_si256( digest[6], G );
+   digest[7] = _mm256_xor_si256( digest[7], H );
 }
 
-void sm3_4way( void *cc, const void *data, size_t len )
+void sm3_8way_init( sm3_8way_ctx_t *ctx )   // reset ctx for a new message
 {
-   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
-   __m128i *block = (__m128i*)ctx->block;
-   __m128i *vdata = (__m128i*)data;
+   ctx->digest[0] = _mm256_set1_epi32( 0x7380166F );   // each starting digest
+   ctx->digest[1] = _mm256_set1_epi32( 0x4914B2B9 );   // word is broadcast to
+   ctx->digest[2] = _mm256_set1_epi32( 0x172442D7 );   // all eight 32-bit lanes
+   ctx->digest[3] = _mm256_set1_epi32( 0xDA8A0600 );
+   ctx->digest[4] = _mm256_set1_epi32( 0xA96F30BC );
+   ctx->digest[5] = _mm256_set1_epi32( 0x163138AA );
+   ctx->digest[6] = _mm256_set1_epi32( 0xE38DEE4D );
+   ctx->digest[7] = _mm256_set1_epi32( 0xB0FB0E4E );
+   ctx->nblocks = 0;   // no blocks compressed yet
+   ctx->num = 0;       // nothing buffered in ctx->block
+}
 
+void sm3_8way_update( void *cc, const void *data, size_t len )
+{  // Absorb len bytes per lane: buffer partial blocks, compress full ones.
+   sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc;
+   __m256i *block = (__m256i*)ctx->block;   // staging block inside the context
+   __m256i *vdata = (__m256i*)data;   // NOTE(review): cast drops const; sm3_8way_compress takes a non-const block
    if ( ctx->num )
    {
       unsigned int left = SM3_BLOCK_SIZE - ctx->num;
       if ( len < left )
       {
-         memcpy_128( block + (ctx->num >> 2), vdata , len>>2 ); 
+         memcpy_256( block + (ctx->num >> 2), vdata , len>>2 );   // bytes >> 2 = words; memcpy_256 presumably counts vectors
          ctx->num += len;
          return;
       }
       else
       {
-         memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
-         sm3_4way_compress( ctx->digest, block );
+         memcpy_256( block + (ctx->num >> 2), vdata , left>>2 );   // top up the staged block
+         sm3_8way_compress( ctx->digest, block );
          ctx->nblocks++;
          vdata += left>>2;
          len -= left;
@@ -92,49 +189,53 @@ void sm3_4way( void *cc, const void *data, size_t len )
    }
    while ( len >= SM3_BLOCK_SIZE )
    {
-      sm3_4way_compress( ctx->digest, vdata );
+      sm3_8way_compress( ctx->digest, vdata );   // full blocks are compressed straight from the input
       ctx->nblocks++;
       vdata += SM3_BLOCK_SIZE>>2;
       len -= SM3_BLOCK_SIZE;
    }
    ctx->num = len;
    if ( len )
-      memcpy_128( block, vdata, len>>2 );
+      memcpy_256( block, vdata, len>>2 );   // keep the tail for the next update or close
 }
 
-void sm3_4way_close( void *cc, void *dst )
+void sm3_8way_close( void *cc, void *dst )   // pad, append bit length, emit digest to dst
 {
-   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
-   __m128i *hash = (__m128i*)dst;
-   __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
-   __m128i *block = (__m128i*)ctx->block;
+   sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc;
+   __m256i *hash = (__m256i*)dst;
+   __m256i *count = (__m256i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );   // last two words of the block
+   __m256i *block = (__m256i*)ctx->block;
    int i;
 
-   block[ctx->num] = _mm_set1_epi32( 0x80 );
+   block[ctx->num >> 2] = _mm256_set1_epi32( 0x80 );   // num is a byte count; block[] is indexed in 32-bit words
 
-   if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
+   if ( ctx->num + 9 <= SM3_BLOCK_SIZE )   // pad word and both length words must fit in this block
    {
-      memset_zero_128( block + (ctx->num >> 2) + 1, 
-                      ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 ); 
+      memset_zero_256( block + (ctx->num >> 2) + 1,
+                      ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
    }
    else
    {
-      memset_zero_128( block + (ctx->num >> 2) + 1, 
-                             ( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) );
+      memset_zero_256( block + (ctx->num >> 2) + 1,
+                      ( SM3_BLOCK_SIZE >> 2 ) - ( ctx->num >> 2 ) - 1 );   // only the words left in this 16-word block
-      sm3_4way_compress( ctx->digest, block );
-      memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
+      sm3_8way_compress( ctx->digest, block );
+      memset_zero_256( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );   // fresh block, length words filled in below
    }
 
-   count[0] = mm128_bswap_32(
-                  _mm_set1_epi32( ctx->nblocks >> 23 ) );
-   count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
+   count[0] = mm256_bswap_32(
+                  _mm256_set1_epi32( ctx->nblocks >> 23 ) );   // high word of nblocks*512 bits
+   count[1] = mm256_bswap_32( _mm256_set1_epi32( ( ctx->nblocks << 9 ) +
                                               ( ctx->num     << 3 ) ) );
-   sm3_4way_compress( ctx->digest, block );
+   sm3_8way_compress( ctx->digest, block );
 
    for ( i = 0; i < 8 ; i++ )
-     hash[i] = mm128_bswap_32( ctx->digest[i] );
+     hash[i] = mm256_bswap_32( ctx->digest[i] );
 }
 
+#endif
+
+#if defined(__SSE2__)
+
 #define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x,  9 ), \
                                                mm128_rol_32( x, 17 ) ) ) 
 #define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
@@ -227,5 +328,88 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
    digest[7] = _mm_xor_si128( digest[7], H );
 }
 
+void sm3_4way_init( sm3_4way_ctx_t *ctx )
+{
+   // Starting digest words; each is broadcast to all four 32-bit lanes.
+   static const uint32_t iv[8] = { 0x7380166F, 0x4914B2B9, 0x172442D7,
+                                   0xDA8A0600, 0xA96F30BC, 0x163138AA,
+                                   0xE38DEE4D, 0xB0FB0E4E };
+   int k;
+   for ( k = 0; k < 8; k++ )
+      ctx->digest[k] = _mm_set1_epi32( iv[k] );
+   // Fresh context: no blocks compressed, nothing buffered.
+   ctx->num = 0;
+   ctx->nblocks = 0;
+}
+
+void sm3_4way_update( void *cc, const void *data, size_t len )
+{
+   // Absorb len bytes per lane: buffer partial blocks, compress full ones.
+   sm3_4way_ctx_t *state = (sm3_4way_ctx_t*)cc;
+   __m128i *buf = (__m128i*)state->block;
+   __m128i *src = (__m128i*)data;
+   const unsigned int pending = state->num;
+   if ( pending != 0 )
+   {
+      // Top up the partially filled block before touching whole blocks.
+      const unsigned int room = SM3_BLOCK_SIZE - pending;
+      if ( len < room )
+      {
+         memcpy_128( buf + ( pending >> 2 ), src, len >> 2 );
+         state->num = pending + len;
+         return;
+      }
+      memcpy_128( buf + ( pending >> 2 ), src, room >> 2 );
+      sm3_4way_compress( state->digest, buf );
+      state->nblocks++;
+      src += room >> 2;
+      len -= room;
+   }
+   // Whole blocks are compressed straight from the caller's buffer.
+   for ( ; len >= SM3_BLOCK_SIZE; len -= SM3_BLOCK_SIZE )
+   {
+      sm3_4way_compress( state->digest, src );
+      state->nblocks++;
+      src += SM3_BLOCK_SIZE >> 2;
+   }
+   // Keep the tail (possibly empty) for the next update or for close.
+   state->num = len;
+   if ( len != 0 )
+      memcpy_128( buf, src, len >> 2 );
+}
+
+void sm3_4way_close( void *cc, void *dst )
+{
+   // Pad the buffered tail, append the bit length and emit the digest to dst.
+   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
+   __m128i *hash = (__m128i*)dst;
+   __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
+   __m128i *block = (__m128i*)ctx->block;
+   int i;
+
+   // ctx->num is a byte count; block[] is indexed in 32-bit words (num/4).
+   block[ctx->num >> 2] = _mm_set1_epi32( 0x80 );
+
+   // The pad word and the two length words must all fit in this block.
+   if ( ctx->num + 9 <= SM3_BLOCK_SIZE )
+      memset_zero_128( block + (ctx->num >> 2) + 1,
+                      ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
+   else
+   {
+      // Zero only the rest of this 16-word block, compress it, start afresh.
+      memset_zero_128( block + (ctx->num >> 2) + 1,
+                      ( SM3_BLOCK_SIZE >> 2 ) - ( ctx->num >> 2 ) - 1 );
+      sm3_4way_compress( ctx->digest, block );
+      memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
+   }
+   // Bit length = nblocks*512 + num*8, split into high and low 32-bit words.
+   count[0] = mm128_bswap_32( _mm_set1_epi32( ctx->nblocks >> 23 ) );
+   count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
+                                              ( ctx->num     << 3 ) ) );
+   sm3_4way_compress( ctx->digest, block );
+   for ( i = 0; i < 8 ; i++ )
+     hash[i] = mm128_bswap_32( ctx->digest[i] );
+}
+
 #endif
 
diff --git a/algo/sm3/sm3-hash-4way.h b/algo/sm3/sm3-hash-4way.h
index 06159d8..abe1dfd 100644
--- a/algo/sm3/sm3-hash-4way.h
+++ b/algo/sm3/sm3-hash-4way.h
@@ -48,14 +48,13 @@
  */
 
 #ifndef SPH_SM3_HASH_4WAY_H
-#define SPH_SM3_HASH_4WAY_H
+#define SPH_SM3_HASH_4WAY_H 1
 
 #define SM3_DIGEST_LENGTH	32
 #define SM3_BLOCK_SIZE		64
 #define SM3_CBLOCK		(SM3_BLOCK_SIZE)
 #define SM3_HMAC_SIZE		(SM3_DIGEST_LENGTH)
 
-
 #include <sys/types.h>
 #include <stdint.h>
 #include <string.h>
@@ -65,7 +64,6 @@
 extern "C" {
 #endif
 
-
 typedef struct {
    __m128i block[16] __attribute__ ((aligned (64)));
    __m128i digest[8];
@@ -74,15 +72,24 @@ typedef struct {
 } sm3_4way_ctx_t;
 
 void sm3_4way_init( sm3_4way_ctx_t *ctx );
-//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
-//                      size_t data_len );
-//void sm3_4way_final( sm3_4way_ctx_t *ctx,
-//                      unsigned char digest[SM3_DIGEST_LENGTH] );
-void sm3_4way_compress( __m128i *digest, __m128i *block );
-
-void sm3_4way(void *cc, const void *data, size_t len);
+void sm3_4way_update(void *cc, const void *data, size_t len);
 void sm3_4way_close(void *cc, void *dst);
 
+#if defined(__AVX2__)
+
+typedef struct {
+   __m256i block[16] __attribute__ ((aligned (64)));   // staged input: 16 words x 8 lanes
+   __m256i digest[8];   // running hash state, set by sm3_8way_init
+   uint32_t nblocks;    // blocks compressed so far
+   uint32_t num;        // bytes (per lane) currently staged in block
+} sm3_8way_ctx_t;
+
+void sm3_8way_init( sm3_8way_ctx_t *ctx );
+void sm3_8way_update(void *cc, const void *data, size_t len);
+void sm3_8way_close(void *cc, void *dst);
+
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c
index fcae00c..aa9f5e9 100644
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -282,11 +282,11 @@ void c11_4way_hash( void *state, const void *input )
      memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
 
      // 1 Blake 4way
-     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input, 80 );
      blake512_4way_close( &ctx.blake, vhash );
 
      // 2 Bmw
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      // Serial
@@ -305,15 +305,15 @@ void c11_4way_hash( void *state, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
      // 4 JH
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      // 5 Keccak
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      // 6 Skein
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      // Serial
diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c
index d1f51c5..94d36f7 100644
--- a/algo/x11/timetravel-4way.c
+++ b/algo/x11/timetravel-4way.c
@@ -84,13 +84,13 @@ void timetravel_4way_hash(void *output, const void *input)
       switch ( permutation[i] )
       {
         case 0:
-           blake512_4way( &ctx.blake, vhashA, dataLen );
+           blake512_4way_update( &ctx.blake, vhashA, dataLen );
            blake512_4way_close( &ctx.blake, vhashB );
            if ( i == 7 )
               dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
         case 1:
-           bmw512_4way( &ctx.bmw, vhashA, dataLen );
+           bmw512_4way_update( &ctx.bmw, vhashA, dataLen );
            bmw512_4way_close( &ctx.bmw, vhashB );
            if ( i == 7 )
               dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
@@ -112,19 +112,19 @@ void timetravel_4way_hash(void *output, const void *input)
               intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
         break;
         case 3:
-           skein512_4way( &ctx.skein, vhashA, dataLen );
+           skein512_4way_update( &ctx.skein, vhashA, dataLen );
            skein512_4way_close( &ctx.skein, vhashB );
            if ( i == 7 )
               dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
         case 4:
-           jh512_4way( &ctx.jh, vhashA, dataLen );
+           jh512_4way_update( &ctx.jh, vhashA, dataLen );
            jh512_4way_close( &ctx.jh, vhashB );
            if ( i == 7 )
               dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
         case 5:
-           keccak512_4way( &ctx.keccak, vhashA, dataLen );
+           keccak512_4way_update( &ctx.keccak, vhashA, dataLen );
            keccak512_4way_close( &ctx.keccak, vhashB );
            if ( i == 7 )
               dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c
index f4c016d..9353124 100644
--- a/algo/x11/timetravel10-4way.c
+++ b/algo/x11/timetravel10-4way.c
@@ -90,13 +90,13 @@ void timetravel10_4way_hash(void *output, const void *input)
       switch ( permutation[i] )
       {
         case 0:
-           blake512_4way( &ctx.blake, vhashA, dataLen );
+           blake512_4way_update( &ctx.blake, vhashA, dataLen );
            blake512_4way_close( &ctx.blake, vhashB );
            if ( i == 9 )
               dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
         case 1:
-           bmw512_4way( &ctx.bmw, vhashA, dataLen );
+           bmw512_4way_update( &ctx.bmw, vhashA, dataLen );
            bmw512_4way_close( &ctx.bmw, vhashB );
            if ( i == 9 )
               dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
@@ -118,19 +118,19 @@ void timetravel10_4way_hash(void *output, const void *input)
               intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
         break;
         case 3:
-           skein512_4way( &ctx.skein, vhashA, dataLen );
+           skein512_4way_update( &ctx.skein, vhashA, dataLen );
            skein512_4way_close( &ctx.skein, vhashB );
            if ( i == 9 )
               dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
         case 4:
-           jh512_4way( &ctx.jh, vhashA, dataLen );
+           jh512_4way_update( &ctx.jh, vhashA, dataLen );
            jh512_4way_close( &ctx.jh, vhashB );
            if ( i == 9 )
               dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
         break;
         case 5:
-           keccak512_4way( &ctx.keccak, vhashA, dataLen );
+           keccak512_4way_update( &ctx.keccak, vhashA, dataLen );
            keccak512_4way_close( &ctx.keccak, vhashB );
            if ( i == 9 )
               dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c
index a30cbc0..bdb680b 100644
--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -282,11 +282,11 @@ void x11_4way_hash( void *state, const void *input )
      memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
 
      // 1 Blake 4way
-     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input, 80 );
      blake512_4way_close( &ctx.blake, vhash );
 
      // 2 Bmw
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      // Serial
@@ -305,15 +305,15 @@ void x11_4way_hash( void *state, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
      // 4 Skein
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      // 5 JH
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      // 6 Keccak
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
diff --git a/algo/x11/x11evo-4way.c b/algo/x11/x11evo-4way.c
index 8fe1512..11e5366 100644
--- a/algo/x11/x11evo-4way.c
+++ b/algo/x11/x11evo-4way.c
@@ -85,12 +85,12 @@ void x11evo_4way_hash( void *state, const void *input )
       switch ( idx )
       {
          case 0:
-            blake512_4way( &ctx.blake, input, 80 );
+            blake512_4way_update( &ctx.blake, input, 80 );
             blake512_4way_close( &ctx.blake, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
          break;
          case 1:
-            bmw512_4way( &ctx.bmw, vhash, 64 );
+            bmw512_4way_update( &ctx.bmw, vhash, 64 );
             bmw512_4way_close( &ctx.bmw, vhash );
             if ( i >= len-1 )
                dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
@@ -112,19 +112,19 @@ void x11evo_4way_hash( void *state, const void *input )
                intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 );
          break;
          case 3:
-            skein512_4way( &ctx.skein, vhash, 64 );
+            skein512_4way_update( &ctx.skein, vhash, 64 );
             skein512_4way_close( &ctx.skein, vhash );
             if ( i >= len-1 )
                dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
          break;
          case 4:
-            jh512_4way( &ctx.jh, vhash, 64 );
+            jh512_4way_update( &ctx.jh, vhash, 64 );
             jh512_4way_close( &ctx.jh, vhash );
             if ( i >= len-1 )
                dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
          break;
          case 5:
-            keccak512_4way( &ctx.keccak, vhash, 64 );
+            keccak512_4way_update( &ctx.keccak, vhash, 64 );
             keccak512_4way_close( &ctx.keccak, vhash );
             if ( i >= len-1 )
                dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
diff --git a/algo/x11/x11gost-4way.c b/algo/x11/x11gost-4way.c
index f3713d7..2158d31 100644
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -310,10 +310,10 @@ void x11gost_4way_hash( void *state, const void *input )
      x11gost_4way_ctx_holder ctx;
      memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
 
-     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input, 80 );
      blake512_4way_close( &ctx.blake, vhash );
 
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      // Serial
@@ -333,13 +333,13 @@ void x11gost_4way_hash( void *state, const void *input )
      // 4way
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      // Serial
diff --git a/algo/x12/x12-4way.c b/algo/x12/x12-4way.c
index ed4d131..d735014 100644
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -272,10 +272,10 @@ void x12_4way_hash( void *state, const void *input )
      x12_4way_ctx_holder ctx;
      memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
 
-     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input, 80 );
      blake512_4way_close( &ctx.blake, vhash );
 
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 
@@ -328,16 +328,16 @@ void x12_4way_hash( void *state, const void *input )
 
      // Parallel 4way 64 bit
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
      dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );
diff --git a/algo/x13/phi1612-4way.c b/algo/x13/phi1612-4way.c
index 7750e75..33e00b4 100644
--- a/algo/x13/phi1612-4way.c
+++ b/algo/x13/phi1612-4way.c
@@ -225,11 +225,11 @@ void phi1612_4way_hash( void *state, const void *input )
      memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) );
 
      // Skein parallel 4way
-     skein512_4way( &ctx.skein, input, 80 );
+     skein512_4way_update( &ctx.skein, input, 80 );
      skein512_4way_close( &ctx.skein, vhash );
 
      // JH
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      // Serial to the end
diff --git a/algo/x13/skunk-4way.c b/algo/x13/skunk-4way.c
index 81899d0..566f545 100644
--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -168,7 +168,7 @@ void skunk_4way_hash( void *output, const void *input )
      skunk_4way_ctx_holder ctx __attribute__ ((aligned (64)));
      memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) );
 
-     skein512_4way( &ctx.skein, input, 80 );
+     skein512_4way_update( &ctx.skein, input, 80 );
      skein512_4way_close( &ctx.skein, vhash );
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 
diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c
index 40b4b5b..3a372eb 100644
--- a/algo/x13/x13-4way.c
+++ b/algo/x13/x13-4way.c
@@ -321,11 +321,11 @@ void x13_4way_hash( void *state, const void *input )
      memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) );
 
      // 1 Blake
-     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input, 80 );
      blake512_4way_close( &ctx.blake, vhash );
 
      // 2 Bmw
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      // Serial
@@ -344,15 +344,15 @@ void x13_4way_hash( void *state, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
      // 4 Skein
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      // 5 JH
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      // 6 Keccak
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      // Serial
@@ -416,7 +416,7 @@ void x13_4way_hash( void *state, const void *input )
 
      // 12 Hamsi parallel 4way 32 bit
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 
diff --git a/algo/x13/x13bcd-4way.c b/algo/x13/x13bcd-4way.c
index 706ea6f..c372dd1 100644
--- a/algo/x13/x13bcd-4way.c
+++ b/algo/x13/x13bcd-4way.c
@@ -1,7 +1,4 @@
 #include "x13sm3-gate.h"
-
-#if defined(X13SM3_4WAY)
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -20,6 +17,281 @@
 #include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"
 
+#if defined(X13BCD_8WAY)
+
+typedef struct {                       // one context per x13bcd stage
+    blake512_8way_context   blake;     // 8 lanes per call
+    bmw512_8way_context     bmw;       // 8 lanes per call
+    hashState_groestl       groestl;   // 1 lane per call
+    skein512_8way_context   skein;     // 8 lanes per call
+    jh512_8way_context      jh;        // 8 lanes per call
+    keccak512_8way_context  keccak;    // 8 lanes per call
+    cubehashParam           cube;      // 1 lane per call
+    sph_shavite512_context  shavite;   // 1 lane per call
+    simd_4way_context       simd;      // 4 lanes per call, run twice
+    hashState_echo          echo;      // 1 lane per call
+    sm3_8way_ctx_t          sm3;       // 8 lanes per call, 8x32 interleave
+    hamsi512_8way_context   hamsi;     // 8 lanes per call
+    sph_fugue512_context    fugue;     // 1 lane per call
+} x13bcd_8way_ctx_holder;
+
+x13bcd_8way_ctx_holder x13bcd_8way_ctx __attribute__ ((aligned (64))); // template
+static __thread blake512_8way_context x13bcd_8way_ctx_mid; // per thread midstate
+
+void init_x13bcd_8way_ctx(void)   // initialise the shared x13bcd_8way_ctx template
+{
+     blake512_8way_init( &x13bcd_8way_ctx.blake );
+     bmw512_8way_init( &x13bcd_8way_ctx.bmw );
+     init_groestl( &x13bcd_8way_ctx.groestl, 64 );
+     skein512_8way_init( &x13bcd_8way_ctx.skein );
+     jh512_8way_init( &x13bcd_8way_ctx.jh );
+     keccak512_8way_init( &x13bcd_8way_ctx.keccak );
+     cubehashInit( &x13bcd_8way_ctx.cube, 512, 16, 32 );
+     sph_shavite512_init( &x13bcd_8way_ctx.shavite );
+     simd_4way_init( &x13bcd_8way_ctx.simd, 512 );
+     init_echo( &x13bcd_8way_ctx.echo, 512 );
+     sm3_8way_init( &x13bcd_8way_ctx.sm3 );
+     hamsi512_8way_init( &x13bcd_8way_ctx.hamsi );
+     sph_fugue512_init( &x13bcd_8way_ctx.fugue );
+}
+
+// Hash eight 80 byte block headers at once. input holds the headers 8x64
+// interleaved; the first 64 bytes of every lane are expected to have been
+// absorbed into x13bcd_8way_ctx_mid already, so only the last 16 bytes are
+// hashed with Blake here. state receives the eight results, 32 bytes per
+// lane, 256 bytes in total.
+void x13bcd_8way_hash( void *state, const void *input )
+{
+     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+     uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t hash4[8] __attribute__ ((aligned (64)));
+     uint64_t hash5[8] __attribute__ ((aligned (64)));
+     uint64_t hash6[8] __attribute__ ((aligned (64)));
+     uint64_t hash7[8] __attribute__ ((aligned (64)));
+     x13bcd_8way_ctx_holder ctx;
+     memcpy( &ctx, &x13bcd_8way_ctx, sizeof(x13bcd_8way_ctx) );
+
+     // Blake, resumed from the midstate held in x13bcd_8way_ctx_mid
+     memcpy( &ctx.blake, &x13bcd_8way_ctx_mid, sizeof(x13bcd_8way_ctx_mid) );
+     blake512_8way_update( &ctx.blake, input + (64<<3), 16 );
+     blake512_8way_close( &ctx.blake, vhash );
+
+     // Bmw
+     bmw512_8way_update( &ctx.bmw, vhash, 64 );
+     bmw512_8way_close( &ctx.bmw, vhash );
+
+     // Serial
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3,
+                       hash4, hash5, hash6, hash7, vhash );
+
+     // Groestl
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     reinit_groestl( &ctx.groestl );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     reinit_groestl( &ctx.groestl );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     reinit_groestl( &ctx.groestl );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+     reinit_groestl( &ctx.groestl );
+     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
+     reinit_groestl( &ctx.groestl );
+     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
+     reinit_groestl( &ctx.groestl );
+     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
+     reinit_groestl( &ctx.groestl );
+     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
+
+     // Parallel 8way
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
+                             hash4, hash5, hash6, hash7 );
+
+     // Skein
+     skein512_8way_update( &ctx.skein, vhash, 64 );
+     skein512_8way_close( &ctx.skein, vhash );
+
+     // JH
+     jh512_8way_update( &ctx.jh, vhash, 64 );
+     jh512_8way_close( &ctx.jh, vhash );
+
+     // Keccak
+     keccak512_8way_update( &ctx.keccak, vhash, 64 );
+     keccak512_8way_close( &ctx.keccak, vhash );
+
+     // SM3 parallel 32 bit
+     rintrlv_8x64_8x32( vhashA, vhash, 512 );
+     memset( vhash, 0, sizeof vhash );
+     sm3_8way_update( &ctx.sm3, vhashA, 64 );
+     sm3_8way_close( &ctx.sm3, vhash );
+     dintrlv_8x32_512( hash0, hash1, hash2, hash3,
+                       hash4, hash5, hash6, hash7, vhash );
+
+     // Cubehash
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
+     memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+     memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
+     memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
+     memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*) hash4, 64 );
+     memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*) hash5, 64 );
+     memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*) hash6, 64 );
+     memcpy( &ctx.cube, &x13bcd_8way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*) hash7, 64 );
+
+     // Shavite
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, sizeof ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, sizeof ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, sizeof ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, sizeof ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash4, 64 );
+     sph_shavite512_close( &ctx.shavite, hash4 );
+     memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, sizeof ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash5, 64 );
+     sph_shavite512_close( &ctx.shavite, hash5 );
+     memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, sizeof ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash6, 64 );
+     sph_shavite512_close( &ctx.shavite, hash6 );
+     memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, sizeof ctx.shavite );
+     sph_shavite512( &ctx.shavite, hash7, 64 );
+     sph_shavite512_close( &ctx.shavite, hash7 );
+
+     // Simd, two passes of four lanes each
+     intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
+     intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 );
+     simd_4way_init( &ctx.simd, 512 );
+     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
+     dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 );
+
+     // Echo
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash4,
+                       (const BitSequence *) hash4, 512 );
+     memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash5,
+                       (const BitSequence *) hash5, 512 );
+     memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash6,
+                       (const BitSequence *) hash6, 512 );
+     memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash7,
+                       (const BitSequence *) hash7, 512 );
+
+     // Hamsi parallel 8way
+     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3,
+                             hash4, hash5, hash6, hash7 );
+     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
+     hamsi512_8way_close( &ctx.hamsi, vhash );
+     dintrlv_8x64_512( hash0, hash1, hash2, hash3,
+                       hash4, hash5, hash6, hash7, vhash );
+
+     // Fugue serial. Each digest is 64 bytes but a lane of state is only 32,
+     // so finish in place and copy out the low half to stay inside state.
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     memcpy( state, hash0, 32 );
+     memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, sizeof ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     memcpy( state+32, hash1, 32 );
+     memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, sizeof ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     memcpy( state+64, hash2, 32 );
+     memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, sizeof ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+     memcpy( state+96, hash3, 32 );
+     memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, sizeof ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash4, 64 );
+     sph_fugue512_close( &ctx.fugue, hash4 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, sizeof ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash5, 64 );
+     sph_fugue512_close( &ctx.fugue, hash5 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, sizeof ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash6, 64 );
+     sph_fugue512_close( &ctx.fugue, hash6 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, sizeof ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash7, 64 );
+     sph_fugue512_close( &ctx.fugue, hash7 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_x13bcd_8way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr )
+{
+     uint32_t hash[8*8] __attribute__ ((aligned (128)));   // 8 lanes x 8 words
+     uint32_t vdata[24*8] __attribute__ ((aligned (64)));  // interleaved header
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];                    // nonce of lane 0 for this pass
+     const uint32_t first_nonce = pdata[19];
+     const uint32_t last_nonce = max_nonce - 8; // a pass consumes 8 nonces
+     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned, vector 9 of vdata
+     int thr_id = mythr->id;  // thr_id arg is deprecated
+     const uint32_t Htarg = ptarget[7];
+
+     mm512_bswap32_intrlv80_8x64( vdata, pdata );
+
+     blake512_8way_init( &x13bcd_8way_ctx_mid );   // midstate: first 64 bytes
+     blake512_8way_update( &x13bcd_8way_ctx_mid, vdata, 64 );
+     do                                            // one pass tests n .. n+7
+     {
+        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+        _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                           n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+
+        x13bcd_8way_hash( hash, vdata );
+        pdata[19] = n;
+
+        for ( int i = 0; i < 8; i++ )             // lane i starts at hash + 8*i
+        if ( (hash+(i<<3))[7] <= Htarg )          // quick test on word 7 first
+        if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+        {
+              pdata[19] = n+i;                    // nonce of the winning lane
+              submit_lane_solution( work, hash+(i<<3), mythr, i );
+        }
+        n += 8;
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+
+     *hashes_done = n - first_nonce;              // nonces covered by this call
+     return 0;
+}
+
+
+#elif defined(X13BCD_4WAY)
+
 typedef struct {
     blake512_4way_context   blake;
     bmw512_4way_context     bmw;
@@ -68,11 +340,11 @@ void x13bcd_4way_hash( void *state, const void *input )
 
      // Blake
      memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) );
-     blake512_4way( &ctx.blake, input + (64<<2), 16 );
+     blake512_4way_update( &ctx.blake, input + (64<<2), 16 );
      blake512_4way_close( &ctx.blake, vhash );
 
      // Bmw
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      // Serial
@@ -91,15 +363,15 @@ void x13bcd_4way_hash( void *state, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
      // Skein
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      // JH
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      // Keccak
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -118,7 +390,7 @@ void x13bcd_4way_hash( void *state, const void *input )
      uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
      memset( sm3_hash3, 0, sizeof sm3_hash3 );
 
-     sm3_4way( &ctx.sm3, vhash, 64 );
+     sm3_4way_update( &ctx.sm3, vhash, 64 );
      sm3_4way_close( &ctx.sm3, sm3_vhash );
      dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
 
@@ -171,20 +443,23 @@ void x13bcd_4way_hash( void *state, const void *input )
 
      // Hamsi parallel 4x32x2
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 
      // Fugue serial
      sph_fugue512( &ctx.fugue, hash0, 64 );
      sph_fugue512_close( &ctx.fugue, hash0 );
-     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue,
+                         sizeof(sph_fugue512_context) );
      sph_fugue512( &ctx.fugue, hash1, 64 );
      sph_fugue512_close( &ctx.fugue, hash1 );
-     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue,
+                         sizeof(sph_fugue512_context) );
      sph_fugue512( &ctx.fugue, hash2, 64 );
      sph_fugue512_close( &ctx.fugue, hash2 );
-     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) );
+     memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue,
+                         sizeof(sph_fugue512_context) );
      sph_fugue512( &ctx.fugue, hash3, 64 );
      sph_fugue512_close( &ctx.fugue, hash3 );
 
@@ -203,44 +478,33 @@ int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
      uint32_t *ptarget = work->target;
      uint32_t n = pdata[19];
      const uint32_t first_nonce = pdata[19];
+     const uint32_t last_nonce = max_nonce - 4;
      __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     int thr_id = mythr->id;
      const uint32_t Htarg = ptarget[7];
-     uint64_t htmax[] = {          0,        0xF,       0xFF,
-                               0xFFF,     0xFFFF, 0x10000000  };
-     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
-                          0xFFFFF000, 0xFFFF0000,          0  };
 
      mm256_bswap32_intrlv80_4x64( vdata, pdata );
 
      blake512_4way_init( &x13bcd_ctx_mid );
      blake512_4way( &x13bcd_ctx_mid, vdata, 64 );
+     do
+     {
+        *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
 
-     for ( int m=0; m < 6; m++ )
-       if ( Htarg <= htmax[m] )
-       {
-         uint32_t mask = masks[m];
-         do
-         {
-           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+        x13bcd_4way_hash( hash, vdata );
+        pdata[19] = n;
 
-            x13bcd_4way_hash( hash, vdata );
-            pdata[19] = n;
-
-            for ( int i = 0; i < 4; i++ )
-            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) )
-            if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-            {
-               pdata[19] = n+i;
-              submit_lane_solution( work, hash+(i<<3), mythr, i );
-            }
-            n += 4;
-         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
-         break;
-       }
-
-     *hashes_done = n - first_nonce + 1;
+        for ( int i = 0; i < 4; i++ )
+        if ( (hash+(i<<3))[7] <= Htarg )
+        if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+        {
+            pdata[19] = n+i;
+            submit_lane_solution( work, hash+(i<<3), mythr, i );
+        }
+        n += 4;
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
+     *hashes_done = n - first_nonce;
      return 0;
 }
 
diff --git a/algo/x13/x13sm3-4way.c b/algo/x13/x13sm3-4way.c
index a107627..9cafa76 100644
--- a/algo/x13/x13sm3-4way.c
+++ b/algo/x13/x13sm3-4way.c
@@ -71,13 +71,11 @@ void x13sm3_4way_hash( void *state, const void *input )
 
      // Blake
      memcpy( &ctx.blake, &x13sm3_ctx_mid, sizeof(x13sm3_ctx_mid) );
-     blake512_4way( &ctx.blake, input + (64<<2), 16 );
-
-//     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input + (64<<2), 16 );
      blake512_4way_close( &ctx.blake, vhash );
 
      // Bmw
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      // Serial
@@ -96,15 +94,15 @@ void x13sm3_4way_hash( void *state, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
      // Skein
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      // JH
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      // Keccak
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      // Serial to the end
@@ -180,13 +178,13 @@ void x13sm3_4way_hash( void *state, const void *input )
      uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
      memset( sm3_hash3, 0, sizeof sm3_hash3 );
 
-     sm3_4way( &ctx.sm3, vhash, 64 );
+     sm3_4way_update( &ctx.sm3, vhash, 64 );
      sm3_4way_close( &ctx.sm3, sm3_vhash );
      dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
 
      // Hamsi parallel 4x32x2
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 
diff --git a/algo/x13/x13sm3-gate.c b/algo/x13/x13sm3-gate.c
index bc0fb92..c7a68fe 100644
--- a/algo/x13/x13sm3-gate.c
+++ b/algo/x13/x13sm3-gate.c
@@ -17,7 +17,11 @@ bool register_x13sm3_algo( algo_gate_t* gate )
 
 bool register_x13bcd_algo( algo_gate_t* gate )
 {
-#if defined (X13SM3_4WAY)
+#if defined (X13BCD_8WAY)
+  init_x13bcd_8way_ctx();
+  gate->scanhash  = (void*)&scanhash_x13bcd_8way;
+  gate->hash      = (void*)&x13bcd_8way_hash;
+#elif defined (X13BCD_4WAY)
   init_x13bcd_4way_ctx();
   gate->scanhash  = (void*)&scanhash_x13bcd_4way;
   gate->hash      = (void*)&x13bcd_4way_hash;
@@ -26,7 +30,7 @@ bool register_x13bcd_algo( algo_gate_t* gate )
   gate->scanhash  = (void*)&scanhash_x13bcd;
   gate->hash      = (void*)&x13bcd_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
   return true;
 };
 
diff --git a/algo/x13/x13sm3-gate.h b/algo/x13/x13sm3-gate.h
index f0047bf..fc6154a 100644
--- a/algo/x13/x13sm3-gate.h
+++ b/algo/x13/x13sm3-gate.h
@@ -5,13 +5,11 @@
 #include <stdint.h>
 
 #if defined(__AVX2__) && defined(__AES__)
-  #define X13SM3_4WAY
+  #define X13SM3_4WAY 1
 #endif
 
 bool register_x13sm3_algo( algo_gate_t* gate );
 
-bool register_x13bcd_algo( algo_gate_t* gate );
-
 #if defined(X13SM3_4WAY)
 
 void x13sm3_4way_hash( void *state, const void *input );
@@ -19,18 +17,39 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13sm3_4way_ctx();
 
-void x13bcd_4way_hash( void *state, const void *input );
-int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
-                       uint64_t *hashes_done, struct thr_info *mythr );
-void init_x13bcd_4way_ctx();
-
-#endif
+#else
 
 void x13sm3_hash( void *state, const void *input );
 int scanhash_x13sm3( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13sm3_ctx();
 
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X13BCD_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
+  #define X13BCD_4WAY 1
+#endif
+
+bool register_x13bcd_algo( algo_gate_t* gate );
+
+#if defined(X13BCD_8WAY)
+
+void x13bcd_8way_hash( void *state, const void *input );
+int scanhash_x13bcd_8way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+void init_x13bcd_8way_ctx();
+
+#elif defined(X13BCD_4WAY)
+
+void x13bcd_4way_hash( void *state, const void *input );
+int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done, struct thr_info *mythr );
+void init_x13bcd_4way_ctx();
+
+#else
+
 void x13bcd_hash( void *state, const void *input );
 int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );
@@ -38,3 +57,4 @@ void init_x13bcd_ctx();
 
 #endif
 
+#endif
diff --git a/algo/x14/polytimos-4way.c b/algo/x14/polytimos-4way.c
index 3e1cc69..09f99b1 100644
--- a/algo/x14/polytimos-4way.c
+++ b/algo/x14/polytimos-4way.c
@@ -34,14 +34,14 @@ void polytimos_4way_hash( void *output, const void *input )
      poly_4way_context_overlay ctx;
 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, input, 80 );
+     skein512_4way_update( &ctx.skein, input, 80 );
      skein512_4way_close( &ctx.skein, vhash );
 
      // Need to convert from 64 bit interleaved to 32 bit interleaved.
      uint32_t vhash32[16*4];
      rintrlv_4x64_4x32( vhash32, vhash, 512 );
      shabal512_4way_init( &ctx.shabal );
-     shabal512_4way( &ctx.shabal, vhash32, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash32, 64 );
      shabal512_4way_close( &ctx.shabal, vhash32 );
      dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
 
diff --git a/algo/x14/veltor-4way.c b/algo/x14/veltor-4way.c
index 4f35161..1f8ea39 100644
--- a/algo/x14/veltor-4way.c
+++ b/algo/x14/veltor-4way.c
@@ -38,7 +38,7 @@ void veltor_4way_hash( void *output, const void *input )
      veltor_4way_ctx_holder ctx __attribute__ ((aligned (64)));
      memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) );
 
-     skein512_4way( &ctx.skein, input, 80 );
+     skein512_4way_update( &ctx.skein, input, 80 );
      skein512_4way_close( &ctx.skein, vhash );
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 
@@ -55,7 +55,7 @@ void veltor_4way_hash( void *output, const void *input )
      sph_shavite512_close( &ctx.shavite, hash3 );
 
      intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
      shabal512_4way_close( &ctx.shabal, vhash );
      dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
 
diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c
index 9de05d3..2b5ce64 100644
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -325,11 +325,11 @@ void x14_4way_hash( void *state, const void *input )
      memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
 
      // 1 Blake
-     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input, 80 );
      blake512_4way_close( &ctx.blake, vhash );
 
      // 2 Bmw
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      // Serial
@@ -348,15 +348,15 @@ void x14_4way_hash( void *state, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
      // 4 Skein
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      // 5 JH
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      // 6 Keccak
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      // Serial
@@ -420,7 +420,7 @@ void x14_4way_hash( void *state, const void *input )
 
      // 12 Hamsi parallel 4way 32 bit
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 
diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c
index a761af0..7431223 100644
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -374,11 +374,11 @@ void x15_4way_hash( void *state, const void *input )
      memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
 
      // 1 Blake
-     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input, 80 );
      blake512_4way_close( &ctx.blake, vhash );
 
      // 2 Bmw
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      // Serial
@@ -397,15 +397,15 @@ void x15_4way_hash( void *state, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
 
      // 4 Skein
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      // 5 JH
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      // 6 Keccak
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      // Serial to the end
@@ -469,7 +469,7 @@ void x15_4way_hash( void *state, const void *input )
 
      // 12 Hamsi parallel 4way 32 bit
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
 
diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c
index d724c78..63170db 100644
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -463,11 +463,11 @@ void x16r_4way_hash( void* output, const void* input )
          case BLAKE:
             blake512_4way_init( &ctx.blake );
             if ( i == 0 )
-               blake512_4way( &ctx.blake, input, size );
+               blake512_4way_update( &ctx.blake, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               blake512_4way( &ctx.blake, vhash, size );
+               blake512_4way_update( &ctx.blake, vhash, size );
             }
             blake512_4way_close( &ctx.blake, vhash );
             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -475,11 +475,11 @@ void x16r_4way_hash( void* output, const void* input )
          case BMW:
             bmw512_4way_init( &ctx.bmw );
             if ( i == 0 )
-               bmw512_4way( &ctx.bmw, input, size );
+               bmw512_4way_update( &ctx.bmw, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               bmw512_4way( &ctx.bmw, vhash, size );
+               bmw512_4way_update( &ctx.bmw, vhash, size );
             }
             bmw512_4way_close( &ctx.bmw, vhash );
             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -501,11 +501,11 @@ void x16r_4way_hash( void* output, const void* input )
          case SKEIN:
             skein512_4way_init( &ctx.skein );
             if ( i == 0 )
-               skein512_4way( &ctx.skein, input, size );
+               skein512_4way_update( &ctx.skein, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               skein512_4way( &ctx.skein, vhash, size );
+               skein512_4way_update( &ctx.skein, vhash, size );
             }
             skein512_4way_close( &ctx.skein, vhash );
             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -513,11 +513,11 @@ void x16r_4way_hash( void* output, const void* input )
          case JH:
             jh512_4way_init( &ctx.jh );
             if ( i == 0 )
-               jh512_4way( &ctx.jh, input, size );
+               jh512_4way_update( &ctx.jh, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               jh512_4way( &ctx.jh, vhash, size );
+               jh512_4way_update( &ctx.jh, vhash, size );
             }
             jh512_4way_close( &ctx.jh, vhash );
             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -525,11 +525,11 @@ void x16r_4way_hash( void* output, const void* input )
          case KECCAK:
             keccak512_4way_init( &ctx.keccak );
             if ( i == 0 )
-               keccak512_4way( &ctx.keccak, input, size );
+               keccak512_4way_update( &ctx.keccak, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               keccak512_4way( &ctx.keccak, vhash, size );
+               keccak512_4way_update( &ctx.keccak, vhash, size );
             }
             keccak512_4way_close( &ctx.keccak, vhash );
             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -599,7 +599,7 @@ void x16r_4way_hash( void* output, const void* input )
          case HAMSI:
              intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
              hamsi512_4way_init( &ctx.hamsi );
-             hamsi512_4way( &ctx.hamsi, vhash, size );
+             hamsi512_4way_update( &ctx.hamsi, vhash, size );
              hamsi512_4way_close( &ctx.hamsi, vhash );
              dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
          break;
@@ -620,7 +620,7 @@ void x16r_4way_hash( void* output, const void* input )
          case SHABAL:
              intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
              shabal512_4way_init( &ctx.shabal );
-             shabal512_4way( &ctx.shabal, vhash, size );
+             shabal512_4way_update( &ctx.shabal, vhash, size );
              shabal512_4way_close( &ctx.shabal, vhash );
              dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
          break;
@@ -641,7 +641,7 @@ void x16r_4way_hash( void* output, const void* input )
          case SHA_512:
              intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
              sha512_4way_init( &ctx.sha512 );
-             sha512_4way( &ctx.sha512, vhash, size );
+             sha512_4way_update( &ctx.sha512, vhash, size );
              sha512_4way_close( &ctx.sha512, vhash );
              dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
          break;
diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c
index 663f61e..03e27c8 100644
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -458,11 +458,11 @@ void x16rt_4way_hash( void* output, const void* input )
          case BLAKE:
             blake512_4way_init( &ctx.blake );
             if ( i == 0 )
-               blake512_4way( &ctx.blake, input, size );
+               blake512_4way_update( &ctx.blake, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               blake512_4way( &ctx.blake, vhash, size );
+               blake512_4way_update( &ctx.blake, vhash, size );
             }
             blake512_4way_close( &ctx.blake, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -470,11 +470,11 @@ void x16rt_4way_hash( void* output, const void* input )
          case BMW:
             bmw512_4way_init( &ctx.bmw );
             if ( i == 0 )
-               bmw512_4way( &ctx.bmw, input, size );
+               bmw512_4way_update( &ctx.bmw, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               bmw512_4way( &ctx.bmw, vhash, size );
+               bmw512_4way_update( &ctx.bmw, vhash, size );
             }
             bmw512_4way_close( &ctx.bmw, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -496,11 +496,11 @@ void x16rt_4way_hash( void* output, const void* input )
          case SKEIN:
             skein512_4way_init( &ctx.skein );
             if ( i == 0 )
-               skein512_4way( &ctx.skein, input, size );
+               skein512_4way_update( &ctx.skein, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               skein512_4way( &ctx.skein, vhash, size );
+               skein512_4way_update( &ctx.skein, vhash, size );
             }
             skein512_4way_close( &ctx.skein, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -508,11 +508,11 @@ void x16rt_4way_hash( void* output, const void* input )
          case JH:
             jh512_4way_init( &ctx.jh );
             if ( i == 0 )
-               jh512_4way( &ctx.jh, input, size );
+               jh512_4way_update( &ctx.jh, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               jh512_4way( &ctx.jh, vhash, size );
+               jh512_4way_update( &ctx.jh, vhash, size );
             }
             jh512_4way_close( &ctx.jh, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -520,11 +520,11 @@ void x16rt_4way_hash( void* output, const void* input )
          case KECCAK:
             keccak512_4way_init( &ctx.keccak );
             if ( i == 0 )
-               keccak512_4way( &ctx.keccak, input, size );
+               keccak512_4way_update( &ctx.keccak, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               keccak512_4way( &ctx.keccak, vhash, size );
+               keccak512_4way_update( &ctx.keccak, vhash, size );
             }
             keccak512_4way_close( &ctx.keccak, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -594,7 +594,7 @@ void x16rt_4way_hash( void* output, const void* input )
          case HAMSI:
              intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
              hamsi512_4way_init( &ctx.hamsi );
-             hamsi512_4way( &ctx.hamsi, vhash, size );
+             hamsi512_4way_update( &ctx.hamsi, vhash, size );
              hamsi512_4way_close( &ctx.hamsi, vhash );
              dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
          break;
@@ -615,7 +615,7 @@ void x16rt_4way_hash( void* output, const void* input )
          case SHABAL:
              intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
              shabal512_4way_init( &ctx.shabal );
-             shabal512_4way( &ctx.shabal, vhash, size );
+             shabal512_4way_update( &ctx.shabal, vhash, size );
              shabal512_4way_close( &ctx.shabal, vhash );
              dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
          break;
@@ -636,7 +636,7 @@ void x16rt_4way_hash( void* output, const void* input )
          case SHA_512:
              intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
              sha512_4way_init( &ctx.sha512 );
-             sha512_4way( &ctx.sha512, vhash, size );
+             sha512_4way_update( &ctx.sha512, vhash, size );
              sha512_4way_close( &ctx.sha512, vhash );
              dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
          break;
diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c
index 7406138..3fd0e8e 100644
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -553,11 +553,11 @@ void x16rv2_4way_hash( void* output, const void* input )
          case BLAKE:
             blake512_4way_init( &ctx.blake );
             if ( i == 0 )
-               blake512_4way( &ctx.blake, input, size );
+               blake512_4way_update( &ctx.blake, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               blake512_4way( &ctx.blake, vhash, size );
+               blake512_4way_update( &ctx.blake, vhash, size );
             }
             blake512_4way_close( &ctx.blake, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -565,11 +565,11 @@ void x16rv2_4way_hash( void* output, const void* input )
          case BMW:
             bmw512_4way_init( &ctx.bmw );
             if ( i == 0 )
-               bmw512_4way( &ctx.bmw, input, size );
+               bmw512_4way_update( &ctx.bmw, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               bmw512_4way( &ctx.bmw, vhash, size );
+               bmw512_4way_update( &ctx.bmw, vhash, size );
             }
             bmw512_4way_close( &ctx.bmw, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -591,11 +591,11 @@ void x16rv2_4way_hash( void* output, const void* input )
          case SKEIN:
             skein512_4way_init( &ctx.skein );
             if ( i == 0 )
-               skein512_4way( &ctx.skein, input, size );
+               skein512_4way_update( &ctx.skein, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               skein512_4way( &ctx.skein, vhash, size );
+               skein512_4way_update( &ctx.skein, vhash, size );
             }
             skein512_4way_close( &ctx.skein, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -603,11 +603,11 @@ void x16rv2_4way_hash( void* output, const void* input )
          case JH:
             jh512_4way_init( &ctx.jh );
             if ( i == 0 )
-               jh512_4way( &ctx.jh, input, size );
+               jh512_4way_update( &ctx.jh, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               jh512_4way( &ctx.jh, vhash, size );
+               jh512_4way_update( &ctx.jh, vhash, size );
             }
             jh512_4way_close( &ctx.jh, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -631,7 +631,7 @@ void x16rv2_4way_hash( void* output, const void* input )
 
              intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
              keccak512_4way_init( &ctx.keccak );
-             keccak512_4way( &ctx.keccak, vhash, 64 );
+             keccak512_4way_update( &ctx.keccak, vhash, 64 );
              keccak512_4way_close( &ctx.keccak, vhash );
              dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
          break;
@@ -721,7 +721,7 @@ void x16rv2_4way_hash( void* output, const void* input )
          case HAMSI:
              intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
              hamsi512_4way_init( &ctx.hamsi );
-             hamsi512_4way( &ctx.hamsi, vhash, size );
+             hamsi512_4way_update( &ctx.hamsi, vhash, size );
              hamsi512_4way_close( &ctx.hamsi, vhash );
              dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
          break;
@@ -742,7 +742,7 @@ void x16rv2_4way_hash( void* output, const void* input )
          case SHABAL:
              intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
              shabal512_4way_init( &ctx.shabal );
-             shabal512_4way( &ctx.shabal, vhash, size );
+             shabal512_4way_update( &ctx.shabal, vhash, size );
              shabal512_4way_close( &ctx.shabal, vhash );
              dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
          break;
@@ -779,7 +779,7 @@ void x16rv2_4way_hash( void* output, const void* input )
  
              intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
              sha512_4way_init( &ctx.sha512 );
-             sha512_4way( &ctx.sha512, vhash, 64 );
+             sha512_4way_update( &ctx.sha512, vhash, 64 );
              sha512_4way_close( &ctx.sha512, vhash );
              dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
          break;
diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c
index 7d98a00..2f7d022 100644
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -578,11 +578,11 @@ void x21s_4way_hash( void* output, const void* input )
          case BLAKE:
             blake512_4way_init( &ctx.blake );
             if ( i == 0 )
-               blake512_4way( &ctx.blake, input, size );
+               blake512_4way_update( &ctx.blake, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               blake512_4way( &ctx.blake, vhash, size );
+               blake512_4way_update( &ctx.blake, vhash, size );
             }
             blake512_4way_close( &ctx.blake, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -590,11 +590,11 @@ void x21s_4way_hash( void* output, const void* input )
          case BMW:
             bmw512_4way_init( &ctx.bmw );
             if ( i == 0 )
-               bmw512_4way( &ctx.bmw, input, size );
+               bmw512_4way_update( &ctx.bmw, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               bmw512_4way( &ctx.bmw, vhash, size );
+               bmw512_4way_update( &ctx.bmw, vhash, size );
             }
             bmw512_4way_close( &ctx.bmw, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -616,11 +616,11 @@ void x21s_4way_hash( void* output, const void* input )
          case SKEIN:
             skein512_4way_init( &ctx.skein );
             if ( i == 0 )
-               skein512_4way( &ctx.skein, input, size );
+               skein512_4way_update( &ctx.skein, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               skein512_4way( &ctx.skein, vhash, size );
+               skein512_4way_update( &ctx.skein, vhash, size );
             }
             skein512_4way_close( &ctx.skein, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -628,11 +628,11 @@ void x21s_4way_hash( void* output, const void* input )
          case JH:
             jh512_4way_init( &ctx.jh );
             if ( i == 0 )
-               jh512_4way( &ctx.jh, input, size );
+               jh512_4way_update( &ctx.jh, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               jh512_4way( &ctx.jh, vhash, size );
+               jh512_4way_update( &ctx.jh, vhash, size );
             }
             jh512_4way_close( &ctx.jh, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -640,11 +640,11 @@ void x21s_4way_hash( void* output, const void* input )
          case KECCAK:
             keccak512_4way_init( &ctx.keccak );
             if ( i == 0 )
-               keccak512_4way( &ctx.keccak, input, size );
+               keccak512_4way_update( &ctx.keccak, input, size );
             else
             {
                intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               keccak512_4way( &ctx.keccak, vhash, size );
+               keccak512_4way_update( &ctx.keccak, vhash, size );
             }
             keccak512_4way_close( &ctx.keccak, vhash );
             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -714,7 +714,7 @@ void x21s_4way_hash( void* output, const void* input )
          case HAMSI:
              intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
              hamsi512_4way_init( &ctx.hamsi );
-             hamsi512_4way( &ctx.hamsi, vhash, size );
+             hamsi512_4way_update( &ctx.hamsi, vhash, size );
              hamsi512_4way_close( &ctx.hamsi, vhash );
              dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
          break;
@@ -735,7 +735,7 @@ void x21s_4way_hash( void* output, const void* input )
          case SHABAL:
              intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
              shabal512_4way_init( &ctx.shabal );
-             shabal512_4way( &ctx.shabal, vhash, size );
+             shabal512_4way_update( &ctx.shabal, vhash, size );
              shabal512_4way_close( &ctx.shabal, vhash );
              dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
          break;
@@ -756,7 +756,7 @@ void x21s_4way_hash( void* output, const void* input )
          case SHA_512:
              intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
              sha512_4way_init( &ctx.sha512 );
-             sha512_4way( &ctx.sha512, vhash, size );
+             sha512_4way_update( &ctx.sha512, vhash, size );
              sha512_4way_close( &ctx.sha512, vhash );
              dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
          break;
@@ -767,7 +767,7 @@ void x21s_4way_hash( void* output, const void* input )
    intrlv_4x32( vhash, hash0, hash1, hash2, hash3,  512 );
 
    haval256_5_4way_init( &ctx.haval );
-   haval256_5_4way( &ctx.haval, vhash, 64 );
+   haval256_5_4way_update( &ctx.haval, vhash, 64 );
    haval256_5_4way_close( &ctx.haval, vhash );
 
    dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -831,7 +831,7 @@ void x21s_4way_hash( void* output, const void* input )
 
    intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
    sha256_4way_init( &ctx.sha256 );
-   sha256_4way( &ctx.sha256, vhash, 64 );
+   sha256_4way_update( &ctx.sha256, vhash, 64 );
    sha256_4way_close( &ctx.sha256, vhash );
    dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 );
 
diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c
index 3a0b248..aad94c1 100644
--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
@@ -1319,7 +1319,7 @@ int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce,
    uint32_t *pdata = work->data;
    const uint32_t *ptarget = work->target;
    const uint32_t first_nonce = pdata[19];
-     const uint32_t last_nonce = max_nonce - 8;
+   const uint32_t last_nonce = max_nonce - 8;
    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
    uint32_t n = first_nonce;
    const int thr_id = mythr->id;
@@ -1350,8 +1350,6 @@ int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce,
    return 0;
 }
 
-
-
 #elif defined(SONOA_4WAY)
 
 union _sonoa_4way_context_overlay
@@ -1391,11 +1389,11 @@ void sonoa_4way_hash( void *state, const void *input )
 // 1
 
      blake512_4way_init( &ctx.blake );
-     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input, 80 );
      blake512_4way_close( &ctx.blake, vhash );
 
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1412,15 +1410,15 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1466,7 +1464,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1483,15 +1481,15 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1535,13 +1533,13 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      hamsi512_4way_init( &ctx.hamsi );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
 // 3
 
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1558,15 +1556,15 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1610,7 +1608,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      hamsi512_4way_init( &ctx.hamsi );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1632,7 +1630,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1649,15 +1647,15 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1701,7 +1699,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      hamsi512_4way_init( &ctx.hamsi );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1722,13 +1720,13 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
 
      shabal512_4way_init( &ctx.shabal );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
      shabal512_4way_close( &ctx.shabal, vhash );
 
      rintrlv_4x32_4x64( vhashB, vhash, 512 ); 
 
      hamsi512_4way_init( &ctx.hamsi );
-     hamsi512_4way( &ctx.hamsi, vhashB, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhashB, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1758,13 +1756,13 @@ void sonoa_4way_hash( void *state, const void *input )
      rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
 
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      rintrlv_4x64_4x32( vhashB, vhash,  512 );
 
      shabal512_4way_init( &ctx.shabal );
-     shabal512_4way( &ctx.shabal, vhashB, 64 );
+     shabal512_4way_update( &ctx.shabal, vhashB, 64 );
      shabal512_4way_close( &ctx.shabal, vhash );
 
      dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -1781,15 +1779,15 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1833,7 +1831,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      hamsi512_4way_init( &ctx.hamsi );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1854,7 +1852,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
 
      shabal512_4way_init( &ctx.shabal );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
      shabal512_4way_close( &ctx.shabal, vhash );
 
      dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -1877,7 +1875,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
      
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1894,15 +1892,15 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -1946,7 +1944,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      hamsi512_4way_init( &ctx.hamsi );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -1967,7 +1965,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
 
      shabal512_4way_init( &ctx.shabal );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
      shabal512_4way_close( &ctx.shabal, vhash );
 
      dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -1988,7 +1986,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      sha512_4way_init( &ctx.sha512 );
-     sha512_4way( &ctx.sha512, vhash, 64 );
+     sha512_4way_update( &ctx.sha512, vhash, 64 );
      sha512_4way_close( &ctx.sha512, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -2011,7 +2009,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -2028,15 +2026,15 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -2080,7 +2078,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      hamsi512_4way_init( &ctx.hamsi );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -2101,7 +2099,7 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
 
      shabal512_4way_init( &ctx.shabal );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
      shabal512_4way_close( &ctx.shabal, vhash );
 
      dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -2122,13 +2120,13 @@ void sonoa_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      sha512_4way_init( &ctx.sha512 );
-     sha512_4way( &ctx.sha512, vhash, 64 );
+     sha512_4way_update( &ctx.sha512, vhash, 64 );
      sha512_4way_close( &ctx.sha512, vhash );
 
      rintrlv_4x64_4x32( vhashB, vhash,  512 );
 
      haval256_5_4way_init( &ctx.haval );
-     haval256_5_4way( &ctx.haval, vhashB, 64 );
+     haval256_5_4way_update( &ctx.haval, vhashB, 64 );
      haval256_5_4way_close( &ctx.haval, state );
 }
 
diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c
index 18eed41..db9c39a 100644
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -360,12 +360,12 @@ void x17_4way_hash( void *state, const void *input )
 
      // 1 Blake parallel 4 way 64 bit
      blake512_4way_init( &ctx.blake );
-     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input, 80 );
      blake512_4way_close( &ctx.blake, vhash );
 
      // 2 Bmw
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_update( &ctx.bmw, vhash, 64 );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      // Serialize
@@ -386,17 +386,17 @@ void x17_4way_hash( void *state, const void *input )
 
      // 4 Skein parallel 4 way 64 bit 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_update( &ctx.skein, vhash, 64 );
      skein512_4way_close( &ctx.skein, vhash );
 
      // 5 JH
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_update( &ctx.jh, vhash, 64 );
      jh512_4way_close( &ctx.jh, vhash );
 
      // 6 Keccak
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_update( &ctx.keccak, vhash, 64 );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      // 7 Luffa  parallel 2 way 128 bit
@@ -428,7 +428,6 @@ void x17_4way_hash( void *state, const void *input )
      dintrlv_2x128_512( hash0, hash1, vhashA );
      dintrlv_2x128_512( hash2, hash3, vhashB );
 
-
      // 11 Echo serial
      init_echo( &ctx.echo, 512 );
      update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -447,7 +446,7 @@ void x17_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      hamsi512_4way_init( &ctx.hamsi );
-     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
      dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
@@ -470,7 +469,7 @@ void x17_4way_hash( void *state, const void *input )
      intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
 
      shabal512_4way_init( &ctx.shabal );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
      shabal512_4way_close( &ctx.shabal, vhash );
 
      dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
@@ -493,14 +492,14 @@ void x17_4way_hash( void *state, const void *input )
      intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
      sha512_4way_init( &ctx.sha512 );
-     sha512_4way( &ctx.sha512, vhash, 64 );
+     sha512_4way_update( &ctx.sha512, vhash, 64 );
      sha512_4way_close( &ctx.sha512, vhash );     
 
      // 17 Haval parallel 32 bit
      rintrlv_4x64_4x32( vhashB, vhash,  512 );
 
      haval256_5_4way_init( &ctx.haval );
-     haval256_5_4way( &ctx.haval, vhashB, 64 );
+     haval256_5_4way_update( &ctx.haval, vhashB, 64 );
      haval256_5_4way_close( &ctx.haval, state );
 }
 
diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c
index 28bc1c2..9cfd4db 100644
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -569,12 +569,12 @@ void xevan_4way_hash( void *output, const void *input )
      // parallel 4 way
 
      blake512_4way_init( &ctx.blake );
-     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_update( &ctx.blake, input, 80 );
      blake512_4way_close(&ctx.blake, vhash);
      memset( &vhash[8<<2], 0, 64<<2 );
 
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, vhash, dataLen );
+     bmw512_4way_update( &ctx.bmw, vhash, dataLen );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      // Serial
@@ -597,15 +597,15 @@ void xevan_4way_hash( void *output, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, vhash, dataLen );
+     skein512_4way_update( &ctx.skein, vhash, dataLen );
      skein512_4way_close( &ctx.skein, vhash );
 
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, dataLen );
+     jh512_4way_update( &ctx.jh, vhash, dataLen );
      jh512_4way_close( &ctx.jh, vhash );
 
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, dataLen );
+     keccak512_4way_update( &ctx.keccak, vhash, dataLen );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -649,7 +649,7 @@ void xevan_4way_hash( void *output, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
 
      hamsi512_4way_init( &ctx.hamsi );
-     hamsi512_4way( &ctx.hamsi, vhash, dataLen );
+     hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -671,7 +671,7 @@ void xevan_4way_hash( void *output, const void *input )
      intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
 
      shabal512_4way_init( &ctx.shabal );
-     shabal512_4way( &ctx.shabal, vhash, dataLen );
+     shabal512_4way_update( &ctx.shabal, vhash, dataLen );
      shabal512_4way_close( &ctx.shabal, vhash );
 
      dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -693,13 +693,13 @@ void xevan_4way_hash( void *output, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
 
      sha512_4way_init( &ctx.sha512 );
-     sha512_4way( &ctx.sha512, vhash, dataLen );
+     sha512_4way_update( &ctx.sha512, vhash, dataLen );
      sha512_4way_close( &ctx.sha512, vhash );
 
      rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
 
      haval256_5_4way_init( &ctx.haval );
-     haval256_5_4way( &ctx.haval, vhashA, dataLen );
+     haval256_5_4way_update( &ctx.haval, vhashA, dataLen );
      haval256_5_4way_close( &ctx.haval, vhashA );
 
      rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 );
@@ -707,11 +707,11 @@ void xevan_4way_hash( void *output, const void *input )
      memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
 
      blake512_4way_init( &ctx.blake );
-     blake512_4way( &ctx.blake, vhash, dataLen );
+     blake512_4way_update( &ctx.blake, vhash, dataLen );
      blake512_4way_close(&ctx.blake, vhash);
 
      bmw512_4way_init( &ctx.bmw );
-     bmw512_4way( &ctx.bmw, vhash, dataLen );
+     bmw512_4way_update( &ctx.bmw, vhash, dataLen );
      bmw512_4way_close( &ctx.bmw, vhash );
 
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -732,15 +732,15 @@ void xevan_4way_hash( void *output, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
 
      skein512_4way_init( &ctx.skein );
-     skein512_4way( &ctx.skein, vhash, dataLen );
+     skein512_4way_update( &ctx.skein, vhash, dataLen );
      skein512_4way_close( &ctx.skein, vhash );
 
      jh512_4way_init( &ctx.jh );
-     jh512_4way( &ctx.jh, vhash, dataLen );
+     jh512_4way_update( &ctx.jh, vhash, dataLen );
      jh512_4way_close( &ctx.jh, vhash );
 
      keccak512_4way_init( &ctx.keccak );
-     keccak512_4way( &ctx.keccak, vhash, dataLen );
+     keccak512_4way_update( &ctx.keccak, vhash, dataLen );
      keccak512_4way_close( &ctx.keccak, vhash );
 
      rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
@@ -784,7 +784,7 @@ void xevan_4way_hash( void *output, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
 
      hamsi512_4way_init( &ctx.hamsi );
-     hamsi512_4way( &ctx.hamsi, vhash, dataLen );
+     hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
      hamsi512_4way_close( &ctx.hamsi, vhash );
 
      dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -805,7 +805,7 @@ void xevan_4way_hash( void *output, const void *input )
      intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
 
      shabal512_4way_init( &ctx.shabal );
-     shabal512_4way( &ctx.shabal, vhash, dataLen );
+     shabal512_4way_update( &ctx.shabal, vhash, dataLen );
      shabal512_4way_close( &ctx.shabal, vhash );
 
      dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
@@ -826,13 +826,13 @@ void xevan_4way_hash( void *output, const void *input )
      intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
 
      sha512_4way_init( &ctx.sha512 );
-     sha512_4way( &ctx.sha512, vhash, dataLen );
+     sha512_4way_update( &ctx.sha512, vhash, dataLen );
      sha512_4way_close( &ctx.sha512, vhash );
 
      rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
 
      haval256_5_4way_init( &ctx.haval );
-     haval256_5_4way( &ctx.haval, vhashA, dataLen );
+     haval256_5_4way_update( &ctx.haval, vhashA, dataLen );
      haval256_5_4way_close( &ctx.haval, output );
 }
 
diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c
index 0d28285..699a642 100644
--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -439,10 +439,8 @@ int scanhash_x22i_8way( struct work* work, uint32_t max_nonce,
    return 0;
 }
 
-
 #elif defined(X22I_4WAY)
 
-
 union _x22i_4way_ctx_overlay
 {
     blake512_4way_context   blake;
@@ -477,8 +475,6 @@ void x22i_4way_hash( void *output, const void *input )
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
-
-//   unsigned char hash[64 * 4] __attribute__((aligned(64))) = {0};
    unsigned char hashA0[64]    __attribute__((aligned(64))) = {0};
    unsigned char hashA1[64]    __attribute__((aligned(32))) = {0};
    unsigned char hashA2[64]    __attribute__((aligned(32))) = {0};
@@ -486,13 +482,12 @@ void x22i_4way_hash( void *output, const void *input )
    x22i_ctx_overlay ctx;
 
    blake512_4way_init( &ctx.blake );
-   blake512_4way( &ctx.blake, input, 80 );
+   blake512_4way_update( &ctx.blake, input, 80 );
    blake512_4way_close( &ctx.blake, vhash );
 
    bmw512_4way_init( &ctx.bmw );
-   bmw512_4way( &ctx.bmw, vhash, 64 );
+   bmw512_4way_update( &ctx.bmw, vhash, 64 );
    bmw512_4way_close( &ctx.bmw, vhash );
-
    dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
    
    init_groestl( &ctx.groestl, 64 );
@@ -511,15 +506,15 @@ void x22i_4way_hash( void *output, const void *input )
    intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
    skein512_4way_init( &ctx.skein );
-   skein512_4way( &ctx.skein, vhash, 64 );
+   skein512_4way_update( &ctx.skein, vhash, 64 );
    skein512_4way_close( &ctx.skein, vhash );
 
    jh512_4way_init( &ctx.jh );
-   jh512_4way( &ctx.jh, vhash, 64 );
+   jh512_4way_update( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );
 
    keccak512_4way_init( &ctx.keccak );
-   keccak512_4way( &ctx.keccak, vhash, 64 );
+   keccak512_4way_update( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );
 
    rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
@@ -560,13 +555,11 @@ void x22i_4way_hash( void *output, const void *input )
    update_final_echo ( &ctx.echo, (BitSequence*)hash3,
                             (const BitSequence*)hash3, 512 );
 
-
    intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
 
    hamsi512_4way_init( &ctx.hamsi );
-   hamsi512_4way( &ctx.hamsi, vhash, 64 );
+   hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
    hamsi512_4way_close( &ctx.hamsi, vhash );
-
    dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
 
    sph_fugue512_init( &ctx.fugue );
@@ -585,9 +578,8 @@ void x22i_4way_hash( void *output, const void *input )
    intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
 
    shabal512_4way_init( &ctx.shabal );
-   shabal512_4way( &ctx.shabal, vhash, 64 );
+   shabal512_4way_update( &ctx.shabal, vhash, 64 );
    shabal512_4way_close( &ctx.shabal, vhash );
-
    dintrlv_4x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8], vhash );
 
    sph_whirlpool_init( &ctx.whirlpool );
@@ -606,12 +598,10 @@ void x22i_4way_hash( void *output, const void *input )
    intrlv_4x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16] );
 
    sha512_4way_init( &ctx.sha512 );
-   sha512_4way( &ctx.sha512, vhash, 64 );
+   sha512_4way_update( &ctx.sha512, vhash, 64 );
    sha512_4way_close( &ctx.sha512, vhash );
-
    dintrlv_4x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24], vhash );
 
-//	InitializeSWIFFTX();
 	ComputeSingleSWIFFTX((unsigned char*)hash0, (unsigned char*)hashA0);
    ComputeSingleSWIFFTX((unsigned char*)hash1, (unsigned char*)hashA1);
    ComputeSingleSWIFFTX((unsigned char*)hash2, (unsigned char*)hashA2);
@@ -622,9 +612,8 @@ void x22i_4way_hash( void *output, const void *input )
    memset( vhash, 0, 64*4 );
 
    haval256_5_4way_init( &ctx.haval );
-   haval256_5_4way( &ctx.haval, vhashA, 64 );
+   haval256_5_4way_update( &ctx.haval, vhashA, 64 );
    haval256_5_4way_close( &ctx.haval, vhash );
-
    dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
      
 	memset( hashA0, 0, 64 );
@@ -675,10 +664,8 @@ void x22i_4way_hash( void *output, const void *input )
    intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 );
 
    sha256_4way_init( &ctx.sha256 );
-   sha256_4way( &ctx.sha256, vhash, 64 );
+   sha256_4way_update( &ctx.sha256, vhash, 64 );
    sha256_4way_close( &ctx.sha256, output );
-   
-//	memcpy(output, hash, 32);
 }
 
 
diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c
index 893a0e3..4026588 100644
--- a/algo/x22/x22i-gate.c
+++ b/algo/x22/x22i-gate.c
@@ -1,5 +1,9 @@
 #include "x22i-gate.h"
 
+// Ryzen has poor AVX2 performance, so prefer SHA over AVX2 on Ryzen.
+// Intel has AVX512, so prefer AVX512 over SHA on Intel.
+// Once Ryzen AVX2 performance improves, prefer AVX2 over SHA there as well.
+
 bool register_x22i_algo( algo_gate_t* gate )
 {
 #if defined (X22I_8WAY)
@@ -23,17 +27,17 @@ bool register_x25x_algo( algo_gate_t* gate )
 #if defined (X25X_8WAY)
   gate->scanhash  = (void*)&scanhash_x25x_8way;
   gate->hash      = (void*)&x25x_8way_hash;
-//  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
 #elif defined (X25X_4WAY)
   gate->scanhash  = (void*)&scanhash_x25x_4way;
   gate->hash      = (void*)&x25x_4way_hash;
-//  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #else
   gate->scanhash  = (void*)&scanhash_x25x;
   gate->hash      = (void*)&x25x_hash;
-//  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+//  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
 
   return true;
 };
diff --git a/algo/x22/x22i-gate.h b/algo/x22/x22i-gate.h
index 1dbb305..a03079f 100644
--- a/algo/x22/x22i-gate.h
+++ b/algo/x22/x22i-gate.h
@@ -34,13 +34,9 @@ int scanhash_x22i( struct work *work, uint32_t max_nonce,
 
 #endif
 
-
-// Big problems with x25x 8 way. It blows up just by increasing the
-// buffer sizes and nothing else. It may have to do with accessing 2 dim arrays.
-
-//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-//  #define X25X_8WAY 1
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define X25X_8WAY 1
+#elif defined(__AVX2__) && defined(__AES__)
   #define X25X_4WAY 1
 #endif
 
diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c
index df8f312..096747b 100644
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -1,7 +1,4 @@
 #include "x22i-gate.h"
-
-#if defined(X25X_4WAY)
-
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
@@ -16,8 +13,11 @@
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
+#include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/nist.h"
+#include "algo/simd/simd-hash-2way.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/tiger/sph_tiger.h"
@@ -27,33 +27,6 @@
 #include "algo/panama/sph_panama.h"
 #include "algo/lanehash/lane.h"
 
-union _x25x_4way_ctx_overlay
-{
-    blake512_4way_context   blake;
-    bmw512_4way_context     bmw;
-    hashState_groestl       groestl;
-    hashState_echo          echo;
-    skein512_4way_context   skein;
-    jh512_4way_context      jh;
-    keccak512_4way_context  keccak;
-    hashState_luffa         luffa;
-    cubehashParam           cube;
-    sph_shavite512_context  shavite;
-    hashState_sd            simd;
-    hamsi512_4way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_4way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_4way_context     sha512;
-    haval256_5_4way_context haval;
-    sph_tiger_context       tiger;
-    sph_gost512_context     gost;
-    sha256_4way_context     sha256;
-    sph_panama_context      panama;
-     blake2s_4way_state           blake2s;
-};
-typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay;
-
 void x25x_shuffle( void *hash )
 {
    // Simple shuffle algorithm, instead of just reversing
@@ -81,28 +54,544 @@ void x25x_shuffle( void *hash )
    #undef X25X_SHUFFLE_ROUNDS
 }
 
-void x25x_4way_hash( void *output, const void *input )
+#if defined(X25X_8WAY)
+
+union _x25x_8way_ctx_overlay   // per-stage hash contexts used by x25x_8way_hash; union members share storage
 {
+    blake512_8way_context   blake;      // 8 lanes per call; output is 8x64 interleaved
+    bmw512_8way_context     bmw;        // 8 lanes per call; output is 8x64 interleaved
+    hashState_groestl       groestl;    // re-initialized and run once per lane (hash0..hash7)
+    skein512_8way_context   skein;      // 8 lanes per call on 8x64 interleaved data
+    jh512_8way_context      jh;         // 8 lanes per call on 8x64 interleaved data
+    keccak512_8way_context  keccak;     // 8 lanes per call on 8x64 interleaved data
+    luffa_4way_context      luffa;      // 4 lanes per call; run twice (vhashA, vhashB)
+    cube_4way_context       cube;       // 4 lanes per call; run twice (vhashA, vhashB)
+    sph_shavite512_context  shavite;    // re-initialized and run once per lane
+    simd_4way_context       simd;       // 4 lanes per call; run twice (vhashA, vhashB)
+    hashState_echo          echo;       // re-initialized and run once per lane
+    hamsi512_8way_context   hamsi;      // 8 lanes per call on 8x64 interleaved data
+    sph_fugue512_context    fugue;      // re-initialized and run once per lane
+    shabal512_8way_context  shabal;     // 8 lanes per call on 8x32 interleaved data
+    sph_whirlpool_context   whirlpool;  // re-initialized and run once per lane
+    sha512_8way_context     sha512;     // 8 lanes per call on 8x64 interleaved data
+    haval256_5_8way_context haval;      // 8 lanes per call on 8x32 interleaved data
+    sph_tiger_context       tiger;      // re-initialized and run once per lane
+    sph_gost512_context     gost;
+    sha256_8way_context     sha256;
+    sph_panama_context      panama;
+    blake2s_8way_state      blake2s;
+};
+typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay;   // alias used for the local ctx in x25x_8way_hash
+
+void x25x_8way_hash( void *output, const void *input )
+{
+   uint64_t vhash[8*8] __attribute__ ((aligned (128)));
+   unsigned char hash0[25][64] __attribute__((aligned(64))) = {0};
+   unsigned char hash1[25][64] __attribute__((aligned(64))) = {0};
+   unsigned char hash2[25][64] __attribute__((aligned(64))) = {0};
+   unsigned char hash3[25][64] __attribute__((aligned(64))) = {0};
+   unsigned char hash4[25][64] __attribute__((aligned(64))) = {0};
+   unsigned char hash5[25][64] __attribute__((aligned(64))) = {0};
+   unsigned char hash6[25][64] __attribute__((aligned(64))) = {0};
+   unsigned char hash7[25][64] __attribute__((aligned(64))) = {0};
+   unsigned char vhashX[24][64*8] __attribute__ ((aligned (64)));
+   uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
+   uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
+   x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64)));
+
+   blake512_8way_init( &ctx.blake );
+   blake512_8way_update( &ctx.blake, input, 80 );
+   blake512_8way_close( &ctx.blake, vhash );
+   dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0],
+                     hash4[0], hash5[0], hash6[0], hash7[0], vhash );
+
+   bmw512_8way_init( &ctx.bmw );
+   bmw512_8way_update( &ctx.bmw, vhash, 64 );
+   bmw512_8way_close( &ctx.bmw, vhash );
+   dintrlv_8x64_512( hash0[1], hash1[1], hash2[1], hash3[1],
+                     hash4[1], hash5[1], hash6[1], hash7[1], vhash );
+
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash0[2],
+                                  (const char*)hash0[1], 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash1[2],
+                                  (const char*)hash1[1], 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash2[2],
+                                  (const char*)hash2[1], 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash3[2],
+                                  (const char*)hash3[1], 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash4[2],
+                                  (const char*)hash4[1], 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash5[2],
+                                  (const char*)hash5[1], 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash6[2],
+                                  (const char*)hash6[1], 512 );
+   init_groestl( &ctx.groestl, 64 );
+   update_and_final_groestl( &ctx.groestl, (char*)hash7[2],
+                                  (const char*)hash7[1], 512 );
+
+   intrlv_8x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2],
+                           hash4[2], hash5[2], hash6[2], hash7[2] );
+   
+   skein512_8way_init( &ctx.skein );
+   skein512_8way_update( &ctx.skein, vhash, 64 );
+   skein512_8way_close( &ctx.skein, vhash );
+   dintrlv_8x64_512( hash0[3], hash1[3], hash2[3], hash3[3],
+                     hash4[3], hash5[3], hash6[3], hash7[3], vhash );
+
+   jh512_8way_init( &ctx.jh );
+   jh512_8way_update( &ctx.jh, vhash, 64 );
+   jh512_8way_close( &ctx.jh, vhash );
+   dintrlv_8x64_512( hash0[4], hash1[4], hash2[4], hash3[4],
+                     hash4[4], hash5[4], hash6[4], hash7[4], vhash );
+
+   keccak512_8way_init( &ctx.keccak );
+   keccak512_8way_update( &ctx.keccak, vhash, 64 );
+   keccak512_8way_close( &ctx.keccak, vhash );
+   dintrlv_8x64_512( hash0[5], hash1[5], hash2[5], hash3[5],
+                     hash4[5], hash5[5], hash6[5], hash7[5], vhash );
+
+   rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
+
+   luffa_4way_init( &ctx.luffa, 512 );
+   luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
+   luffa_4way_init( &ctx.luffa, 512 );
+   luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
+
+   dintrlv_4x128_512( hash0[6], hash1[6], hash2[6], hash3[6], vhashA );
+   dintrlv_4x128_512( hash4[6], hash5[6], hash6[6], hash7[6], vhashB );
+
+   cube_4way_init( &ctx.cube, 512, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
+   cube_4way_init( &ctx.cube, 512, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
+
+   dintrlv_4x128_512( hash0[7], hash1[7], hash2[7], hash3[7], vhashA );
+   dintrlv_4x128_512( hash4[7], hash5[7], hash6[7], hash7[7], vhashB );
+
+	sph_shavite512_init(&ctx.shavite);
+	sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64);
+	sph_shavite512_close(&ctx.shavite, hash0[8]);
+   sph_shavite512_init(&ctx.shavite);
+   sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash1[8]);
+   sph_shavite512_init(&ctx.shavite);
+   sph_shavite512(&ctx.shavite, (const void*) hash2[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash2[8]);
+   sph_shavite512_init(&ctx.shavite);
+   sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash3[8]);
+   sph_shavite512_init(&ctx.shavite);
+   sph_shavite512(&ctx.shavite, (const void*) hash4[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash4[8]);
+   sph_shavite512_init(&ctx.shavite);
+   sph_shavite512(&ctx.shavite, (const void*) hash5[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash5[8]);
+   sph_shavite512_init(&ctx.shavite);
+   sph_shavite512(&ctx.shavite, (const void*) hash6[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash6[8]);
+   sph_shavite512_init(&ctx.shavite);
+   sph_shavite512(&ctx.shavite, (const void*) hash7[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash7[8]);
+
+   intrlv_4x128_512( vhashA, hash0[8], hash1[8], hash2[8], hash3[8] );
+   intrlv_4x128_512( vhashB, hash4[8], hash5[8], hash6[8], hash7[8] );
+
+   simd_4way_init( &ctx.simd, 512 );
+   simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
+   simd_4way_init( &ctx.simd, 512 );
+   simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
+
+   dintrlv_4x128_512( hash0[9], hash1[9], hash2[9], hash3[9], vhashA );
+   dintrlv_4x128_512( hash4[9], hash5[9], hash6[9], hash7[9], vhashB );
+
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash0[10],
+                            (const BitSequence*)hash0[9], 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash1[10],
+                            (const BitSequence*)hash1[9], 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash2[10],
+                            (const BitSequence*)hash2[9], 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash3[10],
+                            (const BitSequence*)hash3[9], 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash4[10],
+                            (const BitSequence*)hash4[9], 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash5[10],
+                            (const BitSequence*)hash5[9], 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash6[10],
+                            (const BitSequence*)hash6[9], 512 );
+   init_echo( &ctx.echo, 512 );
+   update_final_echo ( &ctx.echo, (BitSequence*)hash7[10],
+                            (const BitSequence*)hash7[9], 512 );
+
+   intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10],
+                           hash4[10], hash5[10], hash6[10], hash7[10] );
+
+   hamsi512_8way_init( &ctx.hamsi );
+   hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
+   hamsi512_8way_close( &ctx.hamsi, vhash );
+   dintrlv_8x64_512( hash0[11], hash1[11], hash2[11], hash3[11],
+                     hash4[11], hash5[11], hash6[11], hash7[11], vhash );
+   
+	sph_fugue512_init(&ctx.fugue);
+	sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64);
+	sph_fugue512_close(&ctx.fugue, hash0[12]);
+   sph_fugue512_init(&ctx.fugue);
+   sph_fugue512(&ctx.fugue, (const void*) hash1[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash1[12]);
+   sph_fugue512_init(&ctx.fugue);
+   sph_fugue512(&ctx.fugue, (const void*) hash2[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash2[12]);
+   sph_fugue512_init(&ctx.fugue);
+   sph_fugue512(&ctx.fugue, (const void*) hash3[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash3[12]);
+   sph_fugue512_init(&ctx.fugue);
+   sph_fugue512(&ctx.fugue, (const void*) hash4[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash4[12]);
+   sph_fugue512_init(&ctx.fugue);
+   sph_fugue512(&ctx.fugue, (const void*) hash5[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash5[12]);
+   sph_fugue512_init(&ctx.fugue);
+   sph_fugue512(&ctx.fugue, (const void*) hash6[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash6[12]);
+   sph_fugue512_init(&ctx.fugue);
+   sph_fugue512(&ctx.fugue, (const void*) hash7[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash7[12]);
+
+   intrlv_8x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12],
+                           hash4[12], hash5[12], hash6[12], hash7[12] );
+
+   shabal512_8way_init( &ctx.shabal );
+   shabal512_8way_update( &ctx.shabal, vhash, 64 );
+   shabal512_8way_close( &ctx.shabal, vhash );
+   dintrlv_8x32_512( hash0[13], hash1[13], hash2[13], hash3[13],
+                     hash4[13], hash5[13], hash6[13], hash7[13], vhash );
+
+	sph_whirlpool_init(&ctx.whirlpool);
+	sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64);
+	sph_whirlpool_close(&ctx.whirlpool, hash0[14]);
+   sph_whirlpool_init(&ctx.whirlpool);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash1[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash1[14]);
+   sph_whirlpool_init(&ctx.whirlpool);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash2[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash2[14]);
+   sph_whirlpool_init(&ctx.whirlpool);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash3[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash3[14]);
+   sph_whirlpool_init(&ctx.whirlpool);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash4[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash4[14]);
+   sph_whirlpool_init(&ctx.whirlpool);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash5[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash5[14]);
+   sph_whirlpool_init(&ctx.whirlpool);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash6[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash6[14]);
+   sph_whirlpool_init(&ctx.whirlpool);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash7[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash7[14]);
+
+   intrlv_8x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14],
+                           hash4[14], hash5[14], hash6[14], hash7[14] );
+
+   sha512_8way_init( &ctx.sha512 );
+   sha512_8way_update( &ctx.sha512, vhash, 64 );
+   sha512_8way_close( &ctx.sha512, vhash );
+   dintrlv_8x64_512( hash0[15], hash1[15], hash2[15], hash3[15],
+                     hash4[15], hash5[15], hash6[15], hash7[15], vhash );
+
+   ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash1[12], (unsigned char*)hash1[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash2[12], (unsigned char*)hash2[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash3[12], (unsigned char*)hash3[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash4[12], (unsigned char*)hash4[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash5[12], (unsigned char*)hash5[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash6[12], (unsigned char*)hash6[16]);
+   ComputeSingleSWIFFTX((unsigned char*)hash7[12], (unsigned char*)hash7[16]);
+
+   intrlv_8x32_512( vhashA, hash0[16], hash1[16], hash2[16], hash3[16],
+                            hash4[16], hash5[16], hash6[16], hash7[16] );
+   memset( vhash, 0, 64*8 );
+
+   haval256_5_8way_init( &ctx.haval );
+   haval256_5_8way_update( &ctx.haval, vhashA, 64 );
+   haval256_5_8way_close( &ctx.haval, vhash );
+   dintrlv_8x32_512( hash0[17], hash1[17], hash2[17], hash3[17],
+                     hash4[17], hash5[17], hash6[17], hash7[17], vhash );
+
+	sph_tiger_init(&ctx.tiger);
+	sph_tiger (&ctx.tiger, (const void*) hash0[17], 64);
+	sph_tiger_close(&ctx.tiger, (void*) hash0[18]);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash1[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash1[18]);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash2[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash2[18]);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash3[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash3[18]);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash4[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash4[18]);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash5[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash5[18]);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash6[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash6[18]);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash7[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash7[18]);
+
+   intrlv_2x256( vhash, hash0[18], hash1[18], 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash0[19], hash1[19], vhash, 256 );
+   intrlv_2x256( vhash, hash2[18], hash3[18], 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash2[19], hash3[19], vhash, 256 );
+   intrlv_2x256( vhash, hash4[18], hash5[18], 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash4[19], hash5[19], vhash, 256 );
+   intrlv_2x256( vhash, hash6[18], hash7[18], 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   dintrlv_2x256( hash6[19], hash7[19], vhash, 256 );
+
+	sph_gost512_init(&ctx.gost);
+	sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
+	sph_gost512_close(&ctx.gost, (void*) hash0[20]);
+   sph_gost512_init(&ctx.gost);
+   sph_gost512 (&ctx.gost, (const void*) hash1[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash1[20]);
+   sph_gost512_init(&ctx.gost);
+   sph_gost512 (&ctx.gost, (const void*) hash2[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash2[20]);
+   sph_gost512_init(&ctx.gost);
+   sph_gost512 (&ctx.gost, (const void*) hash3[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash3[20]);
+   sph_gost512_init(&ctx.gost);
+   sph_gost512 (&ctx.gost, (const void*) hash4[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash4[20]);
+   sph_gost512_init(&ctx.gost);
+   sph_gost512 (&ctx.gost, (const void*) hash5[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash5[20]);
+   sph_gost512_init(&ctx.gost);
+   sph_gost512 (&ctx.gost, (const void*) hash6[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash6[20]);
+   sph_gost512_init(&ctx.gost);
+   sph_gost512 (&ctx.gost, (const void*) hash7[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash7[20]);
+
+   intrlv_8x32_512( vhashA, hash0[20], hash1[20], hash2[20], hash3[20],
+                            hash4[20], hash5[20], hash6[20], hash7[20] );
+
+   sha256_8way_init( &ctx.sha256 );
+   sha256_8way_update( &ctx.sha256, vhashA, 64 );
+   sha256_8way_close( &ctx.sha256, vhash );
+   dintrlv_8x32_512( hash0[21], hash1[21], hash2[21], hash3[21],
+                     hash4[21], hash5[21], hash6[21], hash7[21], vhash );
+
+   sph_panama_init(&ctx.panama);
+   sph_panama (&ctx.panama, (const void*) hash0[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash0[22]);
+   sph_panama_init(&ctx.panama);
+   sph_panama (&ctx.panama, (const void*) hash1[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash1[22]);
+   sph_panama_init(&ctx.panama);
+   sph_panama (&ctx.panama, (const void*) hash2[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash2[22]);
+   sph_panama_init(&ctx.panama);
+   sph_panama (&ctx.panama, (const void*) hash3[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash3[22]);
+
+   sph_panama_init(&ctx.panama);
+   sph_panama (&ctx.panama, (const void*) hash4[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash4[22]);
+   sph_panama_init(&ctx.panama);
+   sph_panama (&ctx.panama, (const void*) hash5[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash5[22]);
+   sph_panama_init(&ctx.panama);
+   sph_panama (&ctx.panama, (const void*) hash6[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash6[22]);
+   sph_panama_init(&ctx.panama);
+   sph_panama (&ctx.panama, (const void*) hash7[21], 64 );
+   sph_panama_close(&ctx.panama, (void*) hash7[22]);
+
+   laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]);
+   laneHash(512, (const BitSequence*)hash1[22], 512, (BitSequence*)hash1[23]);
+   laneHash(512, (const BitSequence*)hash2[22], 512, (BitSequence*)hash2[23]);
+   laneHash(512, (const BitSequence*)hash3[22], 512, (BitSequence*)hash3[23]);
+   laneHash(512, (const BitSequence*)hash4[22], 512, (BitSequence*)hash5[23]);
+   laneHash(512, (const BitSequence*)hash5[22], 512, (BitSequence*)hash5[23]);
+   laneHash(512, (const BitSequence*)hash6[22], 512, (BitSequence*)hash6[23]);
+   laneHash(512, (const BitSequence*)hash7[22], 512, (BitSequence*)hash7[23]);
+
+   x25x_shuffle( hash0 );
+   x25x_shuffle( hash1 );
+   x25x_shuffle( hash2 );
+   x25x_shuffle( hash3 );
+   x25x_shuffle( hash4 );
+   x25x_shuffle( hash5 );
+   x25x_shuffle( hash6 );
+   x25x_shuffle( hash7 );
+
+   intrlv_8x32_512( vhashX[ 0], hash0[ 0], hash1[ 0], hash2[ 0], hash3[ 0],
+                                hash4[ 0], hash5[ 0], hash6[ 0], hash7[ 0] );
+   intrlv_8x32_512( vhashX[ 1], hash0[ 1], hash1[ 1], hash2[ 1], hash3[ 1],
+                                hash4[ 1], hash5[ 1], hash6[ 1], hash7[ 1] );
+   intrlv_8x32_512( vhashX[ 2], hash0[ 2], hash1[ 2], hash2[ 2], hash3[ 2],
+                                hash4[ 2], hash5[ 2], hash6[ 2], hash7[ 2] );
+   intrlv_8x32_512( vhashX[ 3], hash0[ 3], hash1[ 3], hash2[ 3], hash3[ 3],
+                                hash4[ 3], hash5[ 3], hash6[ 3], hash7[ 3] );
+   intrlv_8x32_512( vhashX[ 4], hash0[ 4], hash1[ 4], hash2[ 4], hash3[ 4],
+                                hash4[ 4], hash5[ 4], hash6[ 4], hash7[ 4] );
+   intrlv_8x32_512( vhashX[ 5], hash0[ 5], hash1[ 5], hash2[ 5], hash3[ 5],
+                                hash4[ 5], hash5[ 5], hash6[ 5], hash7[ 5] );
+   intrlv_8x32_512( vhashX[ 6], hash0[ 6], hash1[ 6], hash2[ 6], hash3[ 6],
+                                hash4[ 6], hash5[ 6], hash6[ 6], hash7[ 6] );
+   intrlv_8x32_512( vhashX[ 7], hash0[ 7], hash1[ 7], hash2[ 7], hash3[ 7],
+                                hash4[ 7], hash5[ 7], hash6[ 7], hash7[ 7] );
+   intrlv_8x32_512( vhashX[ 8], hash0[ 8], hash1[ 8], hash2[ 8], hash3[ 8],
+                                hash4[ 8], hash5[ 8], hash6[ 8], hash7[ 8] );
+   intrlv_8x32_512( vhashX[ 9], hash0[ 9], hash1[ 9], hash2[ 9], hash3[ 9],
+                                hash4[ 9], hash5[ 9], hash6[ 9], hash7[ 9] );
+   intrlv_8x32_512( vhashX[10], hash0[10], hash1[10], hash2[10], hash3[10],
+                                hash4[10], hash5[10], hash6[10], hash7[10] );
+   intrlv_8x32_512( vhashX[11], hash0[11], hash1[11], hash2[11], hash3[11],
+                                hash4[11], hash5[11], hash6[11], hash7[11] );
+   intrlv_8x32_512( vhashX[12], hash0[12], hash1[12], hash2[12], hash3[12],
+                                hash4[12], hash5[12], hash6[12], hash7[12] );
+   intrlv_8x32_512( vhashX[13], hash0[13], hash1[13], hash2[13], hash3[13],
+                                hash4[13], hash5[13], hash6[13], hash7[13] );
+   intrlv_8x32_512( vhashX[14], hash0[14], hash1[14], hash2[14], hash3[14],
+                                hash4[14], hash5[14], hash6[14], hash7[14] );
+   intrlv_8x32_512( vhashX[15], hash0[15], hash1[15], hash2[15], hash3[15],
+                                hash4[15], hash5[15], hash6[15], hash7[15] );
+   intrlv_8x32_512( vhashX[16], hash0[16], hash1[16], hash2[16], hash3[16],
+                                hash4[16], hash5[16], hash6[16], hash7[16] );
+   intrlv_8x32_512( vhashX[17], hash0[17], hash1[17], hash2[17], hash3[17],
+                                hash4[17], hash5[17], hash6[17], hash7[17] );
+   intrlv_8x32_512( vhashX[18], hash0[18], hash1[18], hash2[18], hash3[18],
+                                hash4[18], hash5[18], hash6[18], hash7[18] );
+   intrlv_8x32_512( vhashX[19], hash0[19], hash1[19], hash2[19], hash3[19],
+                                hash4[19], hash5[19], hash6[19], hash7[19] );
+   intrlv_8x32_512( vhashX[20], hash0[20], hash1[20], hash2[20], hash3[20],
+                                hash4[20], hash5[20], hash6[20], hash7[20] );
+   intrlv_8x32_512( vhashX[21], hash0[21], hash1[21], hash2[21], hash3[21],
+                                hash4[21], hash5[21], hash6[21], hash7[21] );
+   intrlv_8x32_512( vhashX[22], hash0[22], hash1[22], hash2[22], hash3[22],
+                                hash4[22], hash5[22], hash6[22], hash7[22] );
+   intrlv_8x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23],
+                                hash4[23], hash5[23], hash6[23], hash7[23] );
+
+   blake2s_8way_init( &ctx.blake2s, 32 );
+   blake2s_8way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
+}
+
+int scanhash_x25x_8way( struct work* work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{  // Scans nonces 8 per pass (one per lane); always returns 0.
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+   uint32_t *hash7 = &(hash[7<<3]);   // indexed by lane in the filter below
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
+   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 8;   // each pass consumes 8 nonces
+   const int thr_id = mythr->id;
+   const uint32_t Htarg = ptarget[7];   // read before the benchmark override
+   // Benchmark mode overwrites target word 7 with a fixed value.
+   if (opt_benchmark)
+      ((uint32_t*)ptarget)[7] = 0x08ff;
+
+   InitializeSWIFFTX();
+   // vdata is prepared once; only the nonce vector is rewritten per pass.
+   mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   do
+   {
+      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
+      x25x_8way_hash( hash, vdata );
+      // Cheap per-lane pre-filter against Htarg before the full target test.
+      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
+      {
+         extr_lane_8x32( lane_hash, hash, lane, 256 );
+         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+         {
+              pdata[19] = n + lane;   // nonce that produced this lane's hash
+              submit_lane_solution( work, lane_hash, mythr, lane );
+         }
+      }
+      n += 8;
+   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
+
+   *hashes_done = n - first_nonce;   // number of nonces actually scanned
+   return 0;
+}
+
+#elif defined(X25X_4WAY)
+
+union _x25x_4way_ctx_overlay   // hash-stage contexts overlaid in one storage area
+{  // x25x_4way_hash re-initialises a member before each use of it.
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_groestl       groestl;
+    hashState_echo          echo;
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    hashState_luffa         luffa;
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    hashState_sd            simd;
+    hamsi512_4way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_4way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_4way_context     sha512;
+    haval256_5_4way_context haval;
+    sph_tiger_context       tiger;
+    sph_gost512_context     gost;
+    sha256_4way_context     sha256;
+    sph_panama_context      panama;
+    blake2s_4way_state      blake2s;   // used for the final pass over vhashX
+};
+typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay;
+
+void x25x_4way_hash( void *output, const void *input )
+{
+   uint64_t vhash[8*4] __attribute__ ((aligned (128)));
    unsigned char hash0[25][64] __attribute__((aligned(64))) = {0};
    unsigned char hash1[25][64] __attribute__((aligned(64))) = {0};
    unsigned char hash2[25][64] __attribute__((aligned(64))) = {0};
    unsigned char hash3[25][64] __attribute__((aligned(64))) = {0};
-   uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-// Doubling the size of vhashX breaks everything. It may have something
-// to do with accessing arrays: vhashX vs vhashX[0] vs &vhash[0].
-// Changing notation did seem to allow the larger buffer but still resulted
-// in problems further along.
-//   unsigned char vhashX[24][64*8] __attribute__ ((aligned (64)));
    unsigned char vhashX[24][64*4] __attribute__ ((aligned (64)));
    x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
 
    blake512_4way_init( &ctx.blake );
-   blake512_4way( &ctx.blake, input, 80 );
+   blake512_4way_update( &ctx.blake, input, 80 );
    blake512_4way_close( &ctx.blake, vhash );
    dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash );
 
    bmw512_4way_init( &ctx.bmw );
-   bmw512_4way( &ctx.bmw, vhash, 64 );
+   bmw512_4way_update( &ctx.bmw, vhash, 64 );
    bmw512_4way_close( &ctx.bmw, vhash );
    dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );
 
@@ -118,24 +607,24 @@ void x25x_4way_hash( void *output, const void *input )
    init_groestl( &ctx.groestl, 64 );
    update_and_final_groestl( &ctx.groestl, (char*)hash3[2],
                                   (const char*)hash3[1], 512 );
-   
+
    intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );
 
    skein512_4way_init( &ctx.skein );
-   skein512_4way( &ctx.skein, vhash, 64 );
+   skein512_4way_update( &ctx.skein, vhash, 64 );
    skein512_4way_close( &ctx.skein, vhash );
    dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );
 
    jh512_4way_init( &ctx.jh );
-   jh512_4way( &ctx.jh, vhash, 64 );
+   jh512_4way_update( &ctx.jh, vhash, 64 );
    jh512_4way_close( &ctx.jh, vhash );
    dintrlv_4x64_512( hash0[4], hash1[4], hash2[4], hash3[4], vhash );
 
    keccak512_4way_init( &ctx.keccak );
-   keccak512_4way( &ctx.keccak, vhash, 64 );
+   keccak512_4way_update( &ctx.keccak, vhash, 64 );
    keccak512_4way_close( &ctx.keccak, vhash );
    dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );
-   
+
    init_luffa( &ctx.luffa, 512 );
    update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0[6],
                                 (const BitSequence*)hash0[5], 64 );
@@ -162,9 +651,9 @@ void x25x_4way_hash( void *output, const void *input )
    cubehashUpdateDigest( &ctx.cube, (byte*) hash3[7],
                               (const byte*)hash3[6], 64 );
 
-	sph_shavite512_init(&ctx.shavite);
-	sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64);
-	sph_shavite512_close(&ctx.shavite, hash0[8]);
+   sph_shavite512_init(&ctx.shavite);
+   sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64);
+   sph_shavite512_close(&ctx.shavite, hash0[8]);
    sph_shavite512_init(&ctx.shavite);
    sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64);
    sph_shavite512_close(&ctx.shavite, hash1[8]);
@@ -204,13 +693,13 @@ void x25x_4way_hash( void *output, const void *input )
    intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] );
 
    hamsi512_4way_init( &ctx.hamsi );
-   hamsi512_4way( &ctx.hamsi, vhash, 64 );
+   hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
    hamsi512_4way_close( &ctx.hamsi, vhash );
    dintrlv_4x64_512( hash0[11], hash1[11], hash2[11], hash3[11], vhash );
 
-	sph_fugue512_init(&ctx.fugue);
-	sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64);
-	sph_fugue512_close(&ctx.fugue, hash0[12]);
+   sph_fugue512_init(&ctx.fugue);
+   sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64);
+   sph_fugue512_close(&ctx.fugue, hash0[12]);
    sph_fugue512_init(&ctx.fugue);
    sph_fugue512(&ctx.fugue, (const void*) hash1[11], 64);
    sph_fugue512_close(&ctx.fugue, hash1[12]);
@@ -224,13 +713,13 @@ void x25x_4way_hash( void *output, const void *input )
    intrlv_4x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12] );
 
    shabal512_4way_init( &ctx.shabal );
-   shabal512_4way( &ctx.shabal, vhash, 64 );
+   shabal512_4way_update( &ctx.shabal, vhash, 64 );
    shabal512_4way_close( &ctx.shabal, vhash );
    dintrlv_4x32_512( hash0[13], hash1[13], hash2[13], hash3[13], vhash );
 
-	sph_whirlpool_init(&ctx.whirlpool);
-	sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64);
-	sph_whirlpool_close(&ctx.whirlpool, hash0[14]);
+   sph_whirlpool_init(&ctx.whirlpool);
+   sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64);
+   sph_whirlpool_close(&ctx.whirlpool, hash0[14]);
    sph_whirlpool_init(&ctx.whirlpool);
    sph_whirlpool (&ctx.whirlpool, (const void*) hash1[13], 64);
    sph_whirlpool_close(&ctx.whirlpool, hash1[14]);
@@ -244,11 +733,10 @@ void x25x_4way_hash( void *output, const void *input )
    intrlv_4x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14] );
 
    sha512_4way_init( &ctx.sha512 );
-   sha512_4way( &ctx.sha512, vhash, 64 );
+   sha512_4way_update( &ctx.sha512, vhash, 64 );
    sha512_4way_close( &ctx.sha512, vhash );
    dintrlv_4x64_512( hash0[15], hash1[15], hash2[15], hash3[15], vhash );
 
-
    ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]);
    ComputeSingleSWIFFTX((unsigned char*)hash1[12], (unsigned char*)hash1[16]);
    ComputeSingleSWIFFTX((unsigned char*)hash2[12], (unsigned char*)hash2[16]);
@@ -257,15 +745,15 @@ void x25x_4way_hash( void *output, const void *input )
    intrlv_4x32_512( vhashX[0], hash0[16], hash1[16], hash2[16], hash3[16] );
 
    memset( vhash, 0, 64*4 );
-   
+
    haval256_5_4way_init( &ctx.haval );
-   haval256_5_4way( &ctx.haval, vhashX[0], 64 );
+   haval256_5_4way_update( &ctx.haval, vhashX[0], 64 );
    haval256_5_4way_close( &ctx.haval, vhash );
    dintrlv_4x32_512( hash0[17], hash1[17], hash2[17], hash3[17], vhash );
 
-	sph_tiger_init(&ctx.tiger);
-	sph_tiger (&ctx.tiger, (const void*) hash0[17], 64);
-	sph_tiger_close(&ctx.tiger, (void*) hash0[18]);
+   sph_tiger_init(&ctx.tiger);
+   sph_tiger (&ctx.tiger, (const void*) hash0[17], 64);
+   sph_tiger_close(&ctx.tiger, (void*) hash0[18]);
    sph_tiger_init(&ctx.tiger);
    sph_tiger (&ctx.tiger, (const void*) hash1[17], 64);
    sph_tiger_close(&ctx.tiger, (void*) hash1[18]);
@@ -276,7 +764,7 @@ void x25x_4way_hash( void *output, const void *input )
    sph_tiger (&ctx.tiger, (const void*) hash3[17], 64);
    sph_tiger_close(&ctx.tiger, (void*) hash3[18]);
 
-	LYRA2RE( (void*)hash0[19], 32, (const void*)hash0[18], 32,
+   LYRA2RE( (void*)hash0[19], 32, (const void*)hash0[18], 32,
             (const void*)hash0[18], 32, 1, 4, 4 );
    LYRA2RE( (void*)hash1[19], 32, (const void*)hash1[18], 32,
             (const void*)hash1[18], 32, 1, 4, 4 );
@@ -285,9 +773,9 @@ void x25x_4way_hash( void *output, const void *input )
    LYRA2RE( (void*)hash3[19], 32, (const void*)hash3[18], 32,
             (const void*)hash3[18], 32, 1, 4, 4 );
 
-	sph_gost512_init(&ctx.gost);
-	sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
-	sph_gost512_close(&ctx.gost, (void*) hash0[20]);
+   sph_gost512_init(&ctx.gost);
+   sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash0[20]);
    sph_gost512_init(&ctx.gost);
    sph_gost512 (&ctx.gost, (const void*) hash1[19], 64);
    sph_gost512_close(&ctx.gost, (void*) hash1[20]);
@@ -302,7 +790,7 @@ void x25x_4way_hash( void *output, const void *input )
    memset( vhash, 0, 64*4 );
 
    sha256_4way_init( &ctx.sha256 );
-   sha256_4way( &ctx.sha256, vhashX[0], 64 );
+   sha256_4way_update( &ctx.sha256, vhashX[0], 64 );
    sha256_4way_close( &ctx.sha256, vhash );
    dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash );
 
@@ -356,20 +844,12 @@ void x25x_4way_hash( void *output, const void *input )
 
    blake2s_4way_init( &ctx.blake2s, 32 );
    blake2s_4way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 );
-/*
-   dintrlv_4x32( hash0[24], hash1[24], hash2[24], hash3[24], vhash, 256 );
-     
-	memcpy(output,    hash0[24], 32);
-   memcpy(output+32, hash1[24], 32);
-   memcpy(output+64, hash2[24], 32);
-   memcpy(output+96, hash3[24], 32);
-*/
 }
 
 int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t hash[16*4] __attribute__ ((aligned (128)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
    uint32_t *hash7 = &(hash[7<<2]);
@@ -401,17 +881,8 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
          {
               pdata[19] = n + lane;
               submit_lane_solution( work, lane_hash, mythr, lane );
-              }
+         }
       }
-/*
-      for ( int i = 0; i < 4; i++ )
-      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
-      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
-      {
-         pdata[19] = n+i;
-         submit_lane_solution( work, hash+(i<<3), mythr, i );
-      }
-*/
       n += 4;
    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
 
diff --git a/build-allarch.sh b/build-allarch.sh
index 6e8fd89..ea69c63 100755
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -4,7 +4,7 @@
 # during develpment. However the information contained may provide compilation
 # tips to users.
 
-rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen 
+rm -f cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen  # -f: stay quiet when a target is missing
 
 make distclean || echo clean
 rm -f config.status
diff --git a/build-avx2.sh b/build-avx2.sh
new file mode 100755
index 0000000..7a12473
--- /dev/null
+++ b/build-avx2.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+#if [ "$OS" = "Windows_NT" ]; then
+#    ./mingw64.sh
+#    exit 0
+#fi
+
+# Linux build with fixed CFLAGS: -march=haswell plus -maes (see configure below).
+
+make distclean || echo clean   # a failed distclean only prints "clean"
+
+rm -f config.status
+./autogen.sh || echo done      # NOTE: "done" is printed only when autogen.sh fails
+
+# Ubuntu 10.04 (gcc 4.4)
+# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
+
+# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
+#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
+
+#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
+CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
+#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
+
+make -j 4   # build with 4 parallel jobs
+
+strip -s cpuminer   # remove all symbols from ./cpuminer
diff --git a/clean-all.sh b/clean-all.sh
new file mode 100755
index 0000000..6a908ee
--- /dev/null
+++ b/clean-all.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#
+# make distclean and rm all the targeted executables.
+# rm -f is used so that executables which were never built are skipped silently.
+
+rm -f cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen
+
+rm -f cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-aes-avx.exe cpuminer-aes-sse42.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-sse2.exe cpuminer-zen.exe
+
+make distclean
diff --git a/configure b/configure
index 3a5454b..9649ea1 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.6.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.7.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.10.6'
-PACKAGE_STRING='cpuminer-opt 3.10.6'
+PACKAGE_VERSION='3.10.7'
+PACKAGE_STRING='cpuminer-opt 3.10.7'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
 
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.10.6 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.10.7 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1404,7 +1404,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.10.6:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.10.7:";;
    esac
   cat <<\_ACEOF
 
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.10.6
+cpuminer-opt configure 3.10.7
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by cpuminer-opt $as_me 3.10.6, which was
+It was created by cpuminer-opt $as_me 3.10.7, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='cpuminer-opt'
- VERSION='3.10.6'
+ VERSION='3.10.7'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.10.6, which was
+This file was extended by cpuminer-opt $as_me 3.10.7, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.10.6
+cpuminer-opt config.status 3.10.7
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 613de42..ad2b31d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.10.6])
+AC_INIT([cpuminer-opt], [3.10.7])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index f7f8968..c2d7720 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -27,6 +27,9 @@ ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
 #sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
 
 # make release directory and copy selected DLLs.
+
+rm -rf release > /dev/null
+
 mkdir release
 cp README.txt release/
 cp README.md release/
@@ -35,10 +38,6 @@ cp $MINGW_LIB/zlib1.dll release/
 cp $MINGW_LIB/libwinpthread-1.dll release/
 cp $GCC_MINGW_LIB/libstdc++-6.dll release/
 cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
-#cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
-#cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
-#cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
-#cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
 cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
 cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/